docent-python 0.1.3a0__tar.gz → 0.1.4a0__tar.gz

This diff represents the changes between publicly available package versions released to a supported registry. It is provided for informational purposes only, and reflects the packages as they appear in the public registry.

Potentially problematic release: this version of docent-python has been flagged; see the registry listing for details.

Files changed (35)
  1. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/.gitignore +3 -0
  2. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/PKG-INFO +1 -1
  3. docent_python-0.1.4a0/docent/data_models/__init__.py +12 -0
  4. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/data_models/agent_run.py +30 -20
  5. docent_python-0.1.4a0/docent/data_models/metadata.py +229 -0
  6. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/data_models/transcript.py +56 -16
  7. docent_python-0.1.4a0/docent/loaders/load_inspect.py +88 -0
  8. docent_python-0.1.4a0/docent/trace.py +1620 -0
  9. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/trace_alt.py +34 -18
  10. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/pyproject.toml +1 -1
  11. docent_python-0.1.3a0/docent/data_models/__init__.py +0 -19
  12. docent_python-0.1.3a0/docent/data_models/metadata.py +0 -229
  13. docent_python-0.1.3a0/docent/loaders/load_inspect.py +0 -76
  14. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/LICENSE.md +0 -0
  15. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/README.md +0 -0
  16. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/__init__.py +0 -0
  17. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/_log_util/__init__.py +0 -0
  18. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/_log_util/logger.py +0 -0
  19. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/data_models/_tiktoken_util.py +0 -0
  20. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/data_models/chat/__init__.py +0 -0
  21. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/data_models/chat/content.py +0 -0
  22. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/data_models/chat/message.py +0 -0
  23. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/data_models/chat/tool.py +0 -0
  24. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/data_models/citation.py +0 -0
  25. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/data_models/regex.py +0 -0
  26. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/data_models/shared_types.py +0 -0
  27. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/py.typed +0 -0
  28. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/samples/__init__.py +0 -0
  29. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/samples/load.py +0 -0
  30. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/samples/log.eval +0 -0
  31. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/samples/tb_airline.json +0 -0
  32. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/sdk/__init__.py +0 -0
  33. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/sdk/client.py +0 -0
  34. /docent_python-0.1.3a0/docent/trace.py → /docent_python-0.1.4a0/docent/trace_temp.py +0 -0
  35. {docent_python-0.1.3a0 → docent_python-0.1.4a0}/uv.lock +0 -0
{docent_python-0.1.3a0 → docent_python-0.1.4a0}/.gitignore
@@ -192,3 +192,6 @@ personal/caden/*
 inspect_evals
 
 *.swp
+
+# test data cache
+data/cache
{docent_python-0.1.3a0 → docent_python-0.1.4a0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docent-python
-Version: 0.1.3a0
+Version: 0.1.4a0
 Summary: Docent SDK
 Project-URL: Homepage, https://github.com/TransluceAI/docent
 Project-URL: Issues, https://github.com/TransluceAI/docent/issues
docent_python-0.1.4a0/docent/data_models/__init__.py (new file)
@@ -0,0 +1,12 @@
+from docent.data_models.agent_run import AgentRun
+from docent.data_models.citation import Citation
+from docent.data_models.regex import RegexSnippet
+from docent.data_models.transcript import Transcript, TranscriptGroup
+
+__all__ = [
+    "AgentRun",
+    "Citation",
+    "RegexSnippet",
+    "Transcript",
+    "TranscriptGroup",
+]
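
Note: this new `__init__.py` flattens the import surface, so the core models can be imported directly from `docent.data_models`. A minimal usage sketch (not part of the diff; assumes an empty message list is valid for `Transcript`):

from docent.data_models import AgentRun, Transcript

# Construct a run using the new top-level re-exports.
run = AgentRun(
    transcripts={"main": Transcript(messages=[], metadata={})},
    metadata={"task_id": "demo"},
)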
{docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/data_models/agent_run.py
@@ -1,3 +1,4 @@
+import json
 import sys
 from typing import Any, Literal, TypedDict, cast
 from uuid import uuid4
@@ -12,8 +13,11 @@ from pydantic import (
 )
 
 from docent.data_models._tiktoken_util import get_token_count, group_messages_into_ranges
-from docent.data_models.metadata import BaseAgentRunMetadata
-from docent.data_models.transcript import Transcript, TranscriptWithoutMetadataValidator
+from docent.data_models.transcript import (
+    Transcript,
+    TranscriptWithoutMetadataValidator,
+    fake_model_dump,
+)
 
 
 class FilterableField(TypedDict):
@@ -32,7 +36,7 @@ class AgentRun(BaseModel):
         name: Optional human-readable name for the agent run.
         description: Optional description of the agent run.
         transcripts: Dict mapping transcript IDs to Transcript objects.
-        metadata: Additional structured metadata about the agent run.
+        metadata: Additional structured metadata about the agent run as a JSON-serializable dictionary.
     """
 
     id: str = Field(default_factory=lambda: str(uuid4()))
@@ -40,23 +44,34 @@ class AgentRun(BaseModel):
     description: str | None = None
 
     transcripts: dict[str, Transcript]
-    metadata: BaseAgentRunMetadata
+    metadata: dict[str, Any] = Field(default_factory=dict)
 
     @field_serializer("metadata")
-    def serialize_metadata(self, metadata: BaseAgentRunMetadata, _info: Any) -> dict[str, Any]:
+    def serialize_metadata(self, metadata: dict[str, Any], _info: Any) -> dict[str, Any]:
         """
-        Custom serializer for the metadata field so the internal fields are explicitly preserved.
+        Custom serializer for the metadata field - returns the dict as-is since it's already serializable.
         """
-        return metadata.model_dump(strip_internal_fields=False)
+        return fake_model_dump(metadata)
 
     @field_validator("metadata", mode="before")
     @classmethod
-    def _validate_metadata_type(cls, v: Any) -> Any:
-        if v is not None and not isinstance(v, BaseAgentRunMetadata):
-            raise ValueError(
-                f"metadata must be an instance of BaseAgentRunMetadata, got {type(v).__name__}"
-            )
-        return v
+    def _validate_metadata_json_serializable(cls, v: Any) -> dict[str, Any]:
+        """
+        Validates that metadata is a dictionary and is JSON-serializable.
+        """
+        if v is None:
+            return {}
+
+        if not isinstance(v, dict):
+            raise ValueError(f"metadata must be a dictionary, got {type(v).__name__}")
+
+        # Check that the metadata is JSON serializable
+        try:
+            json.dumps(fake_model_dump(cast(dict[str, Any], v)))
+        except (TypeError, ValueError) as e:
+            raise ValueError(f"metadata must be JSON-serializable: {e}")
+
+        return cast(dict[str, Any], v)
 
     @model_validator(mode="after")
     def _validate_transcripts_not_empty(self):
@@ -88,16 +103,11 @@ class AgentRun(BaseModel):
         transcripts_str = "\n\n".join(transcript_strs)
 
         # Gather metadata
-        metadata_obj = self.metadata.model_dump(strip_internal_fields=True)
+        metadata_obj = fake_model_dump(self.metadata)
         if self.name is not None:
             metadata_obj["name"] = self.name
         if self.description is not None:
             metadata_obj["description"] = self.description
-        # Add the field descriptions if they exist
-        metadata_obj = {
-            (f"{k} ({d})" if (d := self.metadata.get_field_description(k)) is not None else k): v
-            for k, v in metadata_obj.items()
-        }
 
         yaml_width = float("inf")
         transcripts_str = (
@@ -202,7 +212,7 @@ class AgentRun(BaseModel):
                 _explore_dict(cast(dict[str, Any], v), f"{prefix}.{k}", depth + 1)
 
         # Look at the agent run metadata
-        _explore_dict(self.metadata.model_dump(strip_internal_fields=True), "metadata", 0)
+        _explore_dict(fake_model_dump(self.metadata), "metadata", 0)
         # Look at the transcript metadata
         # TODO(mengk): restore this later when we have the ability to integrate with SQL.
         # for t_id, t in self.transcripts.items():
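
Note: the net effect of the agent_run.py hunks is that `AgentRun.metadata` is now a plain `dict[str, Any]`, and the before-validator rejects anything that does not survive `json.dumps` after `fake_model_dump`. A hedged sketch of the expected behavior (illustrative only, not from the diff; assumes an empty message list validates, and relies on a Python set not being JSON-serializable):

from docent.data_models import AgentRun, Transcript

# JSON-serializable metadata should validate cleanly.
AgentRun(
    transcripts={"main": Transcript(messages=[], metadata={})},
    metadata={"model": "gpt-4o", "scores": {"accuracy": 0.5}},
)

# A set cannot be JSON-serialized, so validation should raise
# (pydantic's ValidationError subclasses ValueError).
try:
    AgentRun(
        transcripts={"main": Transcript(messages=[], metadata={})},
        metadata={"bad": {1, 2, 3}},
    )
except ValueError as e:
    print("rejected:", e)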
docent_python-0.1.4a0/docent/data_models/metadata.py (new file; entire contents commented out)
@@ -0,0 +1,229 @@
+# import traceback
+# from typing import Any, Optional
+
+# from pydantic import (
+#     BaseModel,
+#     ConfigDict,
+#     Field,
+#     PrivateAttr,
+#     SerializerFunctionWrapHandler,
+#     model_serializer,
+#     model_validator,
+# )
+
+# from docent._log_util import get_logger
+
+# logger = get_logger(__name__)
+
+# SINGLETONS = (int, float, str, bool)
+
+
+# class BaseMetadata(BaseModel):
+#     """Provides common functionality for accessing and validating metadata fields.
+#     All metadata classes should inherit from this class.
+
+#     Serialization Behavior:
+#         - Field descriptions are highly recommended and stored in serialized versions of the object.
+#         - When a subclass of BaseMetadata is uploaded to a server, all extra fields and their descriptions are retained.
+#         - To recover the original structure with proper typing upon download, use:
+#           `CustomMetadataClass.model_validate(obj.model_dump())`.
+
+#     Attributes:
+#         model_config: Pydantic configuration that allows extra fields.
+#         allow_fields_without_descriptions: Boolean indicating whether to allow fields without descriptions.
+#     """
+
+#     model_config = ConfigDict(extra="allow")
+#     allow_fields_without_descriptions: bool = True
+
+#     # Private attribute to store field descriptions
+#     _field_descriptions: dict[str, str | None] | None = PrivateAttr(default=None)
+#     _internal_basemetadata_fields: set[str] = PrivateAttr(
+#         default={
+#             "allow_fields_without_descriptions",
+#             "model_config",
+#             "_field_descriptions",
+#         }
+#     )
+
+#     @model_validator(mode="after")
+#     def _validate_field_types_and_descriptions(self):
+#         """Validates that all fields have descriptions and proper types.
+
+#         Returns:
+#             Self: The validated model instance.
+
+#         Raises:
+#             ValueError: If any field is missing a description or has an invalid type.
+#         """
+#         # Validate each field in the model
+#         for field_name, field_info in self.__class__.model_fields.items():
+#             if field_name in self._internal_basemetadata_fields:
+#                 continue
+
+#             # Check that field has a description
+#             if field_info.description is None:
+#                 if not self.allow_fields_without_descriptions:
+#                     raise ValueError(
+#                         f"Field `{field_name}` needs a description in the definition of `{self.__class__.__name__}`, like `{field_name}: T = Field(description=..., default=...)`. "
+#                         "To allow un-described fields, set `allow_fields_without_descriptions = True` on the instance or in your metadata class definition."
+#                     )
+
+#         # Validate that the metadata is JSON serializable
+#         try:
+#             self.model_dump_json()
+#         except Exception as e:
+#             raise ValueError(
+#                 f"Metadata is not JSON serializable: {e}. Traceback: {traceback.format_exc()}"
+#             )
+
+#         return self
+
+#     def model_post_init(self, __context: Any) -> None:
+#         """Initializes field descriptions from extra data after model initialization.
+
+#         Args:
+#             __context: The context provided by Pydantic's post-initialization hook.
+#         """
+#         fd = self.model_extra.pop("_field_descriptions", None) if self.model_extra else None
+#         if fd is not None:
+#             self._field_descriptions = fd
+
+#     @model_serializer(mode="wrap")
+#     def _serialize_model(self, handler: SerializerFunctionWrapHandler):
+#         # Call the default serializer
+#         data = handler(self)
+
+#         # Dump the field descriptions
+#         if self._field_descriptions is None:
+#             self._field_descriptions = self._compute_field_descriptions()
+#         data["_field_descriptions"] = self._field_descriptions
+
+#         return data
+
+#     def model_dump(
+#         self, *args: Any, strip_internal_fields: bool = False, **kwargs: Any
+#     ) -> dict[str, Any]:
+#         data = super().model_dump(*args, **kwargs)
+
+#         # Remove internal fields if requested
+#         if strip_internal_fields:
+#             for field in self._internal_basemetadata_fields:
+#                 if field in data:
+#                     data.pop(field)
+
+#         return data
+
+#     def get(self, key: str, default_value: Any = None) -> Any:
+#         """Gets a value from the metadata by key.
+
+#         Args:
+#             key: The key to look up in the metadata.
+#             default_value: Value to return if the key is not found. Defaults to None.
+
+#         Returns:
+#             Any: The value associated with the key, or the default value if not found.
+#         """
+#         # Check if the field exists in the model's fields
+#         if key in self.__class__.model_fields or (
+#             self.model_extra is not None and key in self.model_extra
+#         ):
+#             # Field exists, return its value (even if None)
+#             return getattr(self, key)
+
+#         logger.warning(f"Field '{key}' not found in {self.__class__.__name__}")
+#         return default_value
+
+#     def get_field_description(self, field_name: str) -> str | None:
+#         """Gets the description of a field defined in the model schema.
+
+#         Args:
+#             field_name: The name of the field.
+
+#         Returns:
+#             str or None: The description string if the field is defined in the model schema
+#             and has a description, otherwise None.
+#         """
+#         if self._field_descriptions is None:
+#             self._field_descriptions = self._compute_field_descriptions()
+
+#         if field_name in self._field_descriptions:
+#             return self._field_descriptions[field_name]
+
+#         logger.warning(
+#             f"Field description for '{field_name}' not found in {self.__class__.__name__}"
+#         )
+#         return None
+
+#     def get_all_field_descriptions(self) -> dict[str, str | None]:
+#         """Gets descriptions for all fields defined in the model schema.
+
+#         Returns:
+#             dict: A dictionary mapping field names to their descriptions.
+#             Only includes fields that have descriptions defined in the schema.
+#         """
+#         if self._field_descriptions is None:
+#             self._field_descriptions = self._compute_field_descriptions()
+#         return self._field_descriptions
+
+#     def _compute_field_descriptions(self) -> dict[str, str | None]:
+#         """Computes descriptions for all fields in the model.
+
+#         Returns:
+#             dict: A dictionary mapping field names to their descriptions.
+#         """
+#         field_descriptions: dict[str, Optional[str]] = {}
+#         for field_name, field_info in self.__class__.model_fields.items():
+#             if field_name not in self._internal_basemetadata_fields:
+#                 field_descriptions[field_name] = field_info.description
+#         return field_descriptions
+
+
+# class BaseAgentRunMetadata(BaseMetadata):
+#     """Extends BaseMetadata with fields specific to agent evaluation runs.
+
+#     Attributes:
+#         scores: Dictionary of evaluation metrics.
+#     """
+
+#     scores: dict[str, int | float | bool | None] = Field(
+#         description="A dict of score_key -> score_value. Use one key for each metric you're tracking."
+#     )


+# class InspectAgentRunMetadata(BaseAgentRunMetadata):
+#     """Extends BaseAgentRunMetadata with fields specific to Inspect runs.
+
+#     Attributes:
+#         task_id: The ID of the 'benchmark' or 'set of evals' that the transcript belongs to
+#         sample_id: The specific task inside of the `task_id` benchmark that the transcript was run on
+#         epoch_id: Each `sample_id` should be run multiple times due to stochasticity; `epoch_id` is the integer index of a specific run.
+#         model: The model that was used to generate the transcript
+#         scoring_metadata: Additional metadata about the scoring process
+#         additional_metadata: Additional metadata about the transcript
+#     """
+
+#     task_id: str = Field(
+#         description="The ID of the 'benchmark' or 'set of evals' that the transcript belongs to"
+#     )
+
+#     # Identification of this particular run
+#     sample_id: str = Field(
+#         description="The specific task inside of the `task_id` benchmark that the transcript was run on"
+#     )
+#     epoch_id: int = Field(
+#         description="Each `sample_id` should be run multiple times due to stochasticity; `epoch_id` is the integer index of a specific run."
+#     )
+
+#     # Parameters for the run
+#     model: str = Field(description="The model that was used to generate the transcript")
+
+#     # Scoring
+#     scoring_metadata: dict[str, Any] | None = Field(
+#         description="Additional metadata about the scoring process"
+#     )
+
+#     # Inspect metadata
+#     additional_metadata: dict[str, Any] | None = Field(
+#         description="Additional metadata about the transcript"
+#     )
{docent_python-0.1.3a0 → docent_python-0.1.4a0}/docent/data_models/transcript.py
@@ -11,7 +11,6 @@ from docent.data_models._tiktoken_util import (
     truncate_to_token_limit,
 )
 from docent.data_models.chat import AssistantMessage, ChatMessage, ContentReasoning
-from docent.data_models.metadata import BaseMetadata
 
 # Template for formatting individual transcript blocks
 TRANSCRIPT_BLOCK_TEMPLATE = """
@@ -63,6 +62,53 @@ def format_chat_message(
     )
 
 
+class TranscriptGroup(BaseModel):
+    """Represents a group of transcripts that are logically related.
+
+    A transcript group can contain multiple transcripts and can have a hierarchical
+    structure with parent groups. This is useful for organizing transcripts into
+    logical units like experiments, tasks, or sessions.
+
+    Attributes:
+        id: Unique identifier for the transcript group, auto-generated by default.
+        name: Optional human-readable name for the transcript group.
+        description: Optional description of the transcript group.
+        parent_transcript_group_id: Optional ID of the parent transcript group.
+        metadata: Additional structured metadata about the transcript group.
+    """
+
+    id: str = Field(default_factory=lambda: str(uuid4()))
+    name: str | None = None
+    description: str | None = None
+    parent_transcript_group_id: str | None = None
+    metadata: dict[str, Any] = Field(default_factory=dict)
+
+    @field_serializer("metadata")
+    def serialize_metadata(self, metadata: dict[str, Any], _info: Any) -> dict[str, Any]:
+        """
+        Custom serializer for the metadata field so the internal fields are explicitly preserved.
+        """
+        return fake_model_dump(metadata)
+
+    @field_validator("metadata", mode="before")
+    @classmethod
+    def _validate_metadata_type(cls, v: Any) -> Any:
+        if v is not None and not isinstance(v, dict):
+            raise ValueError(f"metadata must be a dictionary, got {type(v).__name__}")
+        return v  # type: ignore
+
+
+def fake_model_dump(obj: dict[str, Any]) -> dict[str, Any]:
+    """
+    Emulate the action of pydantic.model_dump() for non-pydantic objects (to handle nested values)
+    """
+
+    class _FakeModel(BaseModel):
+        data: dict[str, Any]
+
+    return _FakeModel(data=obj).model_dump()["data"]
+
+
 class Transcript(BaseModel):
     """Represents a transcript of messages in a conversation with an AI agent.
 
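Note: `fake_model_dump` is the linchpin of the metadata change. By routing a plain dict through a throwaway pydantic model typed `dict[str, Any]`, nested pydantic objects should be recursively dumped the way `model_dump()` would dump them. A small sketch of the intended behavior (the `Usage` model is hypothetical, not from the diff):

from pydantic import BaseModel

from docent.data_models.transcript import fake_model_dump

class Usage(BaseModel):
    input_tokens: int
    output_tokens: int

meta = {"usage": Usage(input_tokens=10, output_tokens=3), "run": "demo"}
# Nested BaseModel instances should come back as plain dicts, which is what
# makes the downstream json.dumps serializability check meaningful.
print(fake_model_dump(meta))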
@@ -74,6 +120,7 @@ class Transcript(BaseModel):
         id: Unique identifier for the transcript, auto-generated by default.
         name: Optional human-readable name for the transcript.
         description: Optional description of the transcript.
+        transcript_group_id: Optional ID of the transcript group this transcript belongs to.
         messages: List of chat messages in the transcript.
         metadata: Additional structured metadata about the transcript.
     """
@@ -81,27 +128,25 @@ class Transcript(BaseModel):
     id: str = Field(default_factory=lambda: str(uuid4()))
     name: str | None = None
     description: str | None = None
+    transcript_group_id: str | None = None
 
     messages: list[ChatMessage]
-    metadata: BaseMetadata = Field(default_factory=BaseMetadata)
-
+    metadata: dict[str, Any] = Field(default_factory=dict)
     _units_of_action: list[list[int]] | None = PrivateAttr(default=None)
 
     @field_serializer("metadata")
-    def serialize_metadata(self, metadata: BaseMetadata, _info: Any) -> dict[str, Any]:
+    def serialize_metadata(self, metadata: dict[str, Any], _info: Any) -> dict[str, Any]:
         """
         Custom serializer for the metadata field so the internal fields are explicitly preserved.
         """
-        return metadata.model_dump(strip_internal_fields=False)
+        return fake_model_dump(metadata)
 
     @field_validator("metadata", mode="before")
     @classmethod
     def _validate_metadata_type(cls, v: Any) -> Any:
-        if v is not None and not isinstance(v, BaseMetadata):
-            raise ValueError(
-                f"metadata must be an instance of BaseMetadata, got {type(v).__name__}"
-            )
-        return v
+        if v is not None and not isinstance(v, dict):
+            raise ValueError(f"metadata must be a dict, got {type(v).__name__}")
+        return v  # type: ignore
 
     @property
     def units_of_action(self) -> list[list[int]]:
@@ -297,12 +342,7 @@ class Transcript(BaseModel):
         blocks_str = "\n".join(au_blocks)
 
         # Gather metadata
-        metadata_obj = self.metadata.model_dump(strip_internal_fields=True)
-        # Add the field descriptions if they exist
-        metadata_obj = {
-            (f"{k} ({d})" if (d := self.metadata.get_field_description(k)) is not None else k): v
-            for k, v in metadata_obj.items()
-        }
+        metadata_obj = fake_model_dump(self.metadata)
 
         yaml_width = float("inf")
         block_str = f"<blocks>\n{blocks_str}\n</blocks>\n"
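
Note: with `transcript_group_id` on `Transcript` and the new `TranscriptGroup` model, transcripts can now be organized hierarchically. A brief sketch (not from the diff; assumes an empty message list validates):

from docent.data_models import Transcript, TranscriptGroup

group = TranscriptGroup(name="experiment-1", metadata={"seed": 42})
t = Transcript(messages=[], transcript_group_id=group.id, metadata={"split": "dev"})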
docent_python-0.1.4a0/docent/loaders/load_inspect.py (new file)
@@ -0,0 +1,88 @@
+from typing import Any
+
+from inspect_ai.log import EvalLog
+from inspect_ai.scorer import CORRECT, INCORRECT, NOANSWER, PARTIAL, Score
+
+from docent.data_models import AgentRun, Transcript
+from docent.data_models.chat import parse_chat_message
+
+
+def _normalize_inspect_score(score: Score) -> Any:
+    """
+    Normalize an inspect score to a float. This implements the same logic as inspect_ai.scorer._metric.value_to_float, but fails more conspicuously.
+
+    Args:
+        score: The inspect score to normalize.
+
+    Returns:
+        The normalized score as a float, or None if the score is not a valid value.
+    """
+
+    def _leaf_normalize(value: int | float | bool | str | None) -> float | str | None:
+        if value is None:
+            return None
+        if isinstance(value, int | float | bool):
+            return float(value)
+        if value == CORRECT:
+            return 1.0
+        if value == PARTIAL:
+            return 0.5
+        if value in [INCORRECT, NOANSWER]:
+            return 0
+        value = str(value).lower()
+        if value in ["yes", "true"]:
+            return 1.0
+        if value in ["no", "false"]:
+            return 0.0
+        if value.replace(".", "").isnumeric():
+            return float(value)
+        return value
+
+    if isinstance(score.value, int | float | bool | str):
+        return _leaf_normalize(score.value)
+    if isinstance(score.value, list):
+        return [_leaf_normalize(v) for v in score.value]
+    assert isinstance(score.value, dict), "Inspect score must be leaf value, list, or dict"
+    return {k: _leaf_normalize(v) for k, v in score.value.items()}
+
+
+def load_inspect_log(log: EvalLog) -> list[AgentRun]:
+    if log.samples is None:
+        return []
+
+    # TODO(vincent): fix this
+    agent_runs: list[AgentRun] = []
+
+    for s in log.samples:
+        sample_id = s.id
+        epoch_id = s.epoch
+
+        if s.scores is None:
+            sample_scores = {}
+        else:
+            sample_scores = {k: _normalize_inspect_score(v) for k, v in s.scores.items()}
+
+        metadata = {
+            "task_id": log.eval.task,
+            "sample_id": str(sample_id),
+            "epoch_id": epoch_id,
+            "model": log.eval.model,
+            "additional_metadata": s.metadata,
+            "scores": sample_scores,
+            # Scores could have answers, explanations, and other metadata besides the values we extract
+            "scoring_metadata": s.scores,
+        }
+
+        agent_runs.append(
+            AgentRun(
+                transcripts={
+                    "main": Transcript(
+                        messages=[parse_chat_message(m.model_dump()) for m in s.messages],
+                        metadata={},
+                    )
+                },
+                metadata=metadata,
+            )
+        )
+
+    return agent_runs
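
Note: a usage sketch for the new loader. `read_eval_log` is the standard `inspect_ai` entry point for reading an eval log from disk; the log path here is hypothetical:

from inspect_ai.log import read_eval_log

from docent.loaders.load_inspect import load_inspect_log

log = read_eval_log("logs/my-task.eval")  # hypothetical path
runs = load_inspect_log(log)
for run in runs:
    print(run.metadata["task_id"], run.metadata["scores"])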