docent-python 0.1.41a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docent-python has been flagged as potentially problematic; see the registry's advisory page for more details.
- docent/__init__.py +4 -0
- docent/_llm_util/__init__.py +0 -0
- docent/_llm_util/data_models/__init__.py +0 -0
- docent/_llm_util/data_models/exceptions.py +48 -0
- docent/_llm_util/data_models/llm_output.py +331 -0
- docent/_llm_util/llm_cache.py +193 -0
- docent/_llm_util/llm_svc.py +472 -0
- docent/_llm_util/model_registry.py +134 -0
- docent/_llm_util/providers/__init__.py +0 -0
- docent/_llm_util/providers/anthropic.py +537 -0
- docent/_llm_util/providers/common.py +41 -0
- docent/_llm_util/providers/google.py +530 -0
- docent/_llm_util/providers/openai.py +745 -0
- docent/_llm_util/providers/openrouter.py +375 -0
- docent/_llm_util/providers/preference_types.py +104 -0
- docent/_llm_util/providers/provider_registry.py +164 -0
- docent/_log_util/__init__.py +3 -0
- docent/_log_util/logger.py +141 -0
- docent/data_models/__init__.py +14 -0
- docent/data_models/_tiktoken_util.py +91 -0
- docent/data_models/agent_run.py +473 -0
- docent/data_models/chat/__init__.py +37 -0
- docent/data_models/chat/content.py +56 -0
- docent/data_models/chat/message.py +191 -0
- docent/data_models/chat/tool.py +109 -0
- docent/data_models/citation.py +187 -0
- docent/data_models/formatted_objects.py +84 -0
- docent/data_models/judge.py +17 -0
- docent/data_models/metadata_util.py +16 -0
- docent/data_models/regex.py +56 -0
- docent/data_models/transcript.py +305 -0
- docent/data_models/util.py +170 -0
- docent/judges/__init__.py +23 -0
- docent/judges/analysis.py +77 -0
- docent/judges/impl.py +587 -0
- docent/judges/runner.py +129 -0
- docent/judges/stats.py +205 -0
- docent/judges/types.py +320 -0
- docent/judges/util/forgiving_json.py +108 -0
- docent/judges/util/meta_schema.json +86 -0
- docent/judges/util/meta_schema.py +29 -0
- docent/judges/util/parse_output.py +68 -0
- docent/judges/util/voting.py +139 -0
- docent/loaders/load_inspect.py +215 -0
- docent/py.typed +0 -0
- docent/samples/__init__.py +3 -0
- docent/samples/load.py +9 -0
- docent/samples/log.eval +0 -0
- docent/samples/tb_airline.json +1 -0
- docent/sdk/__init__.py +0 -0
- docent/sdk/agent_run_writer.py +317 -0
- docent/sdk/client.py +1186 -0
- docent/sdk/llm_context.py +432 -0
- docent/trace.py +2741 -0
- docent/trace_temp.py +1086 -0
- docent_python-0.1.41a0.dist-info/METADATA +33 -0
- docent_python-0.1.41a0.dist-info/RECORD +59 -0
- docent_python-0.1.41a0.dist-info/WHEEL +4 -0
- docent_python-0.1.41a0.dist-info/licenses/LICENSE.md +13 -0
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import textwrap
|
|
3
|
+
from collections import deque
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Any, Literal, TypedDict, cast
|
|
6
|
+
from uuid import uuid4
|
|
7
|
+
|
|
8
|
+
import yaml
|
|
9
|
+
from pydantic import (
|
|
10
|
+
BaseModel,
|
|
11
|
+
Field,
|
|
12
|
+
PrivateAttr,
|
|
13
|
+
field_validator,
|
|
14
|
+
model_validator,
|
|
15
|
+
)
|
|
16
|
+
from pydantic_core import to_jsonable_python
|
|
17
|
+
|
|
18
|
+
from docent._log_util import get_logger
|
|
19
|
+
from docent.data_models._tiktoken_util import get_token_count, group_messages_into_ranges
|
|
20
|
+
from docent.data_models.metadata_util import dump_metadata
|
|
21
|
+
from docent.data_models.transcript import Transcript, TranscriptGroup
|
|
22
|
+
|
|
23
|
+
logger = get_logger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class FilterableField(TypedDict):
    """Typed description of a single field an agent run can be filtered on."""

    # Dotted path of the field, e.g. "metadata.<key>" (see get_filterable_fields).
    name: str
    # Primitive type name of the field's value, as produced by type(v).__name__.
    type: Literal["str", "bool", "int", "float"]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class AgentRun(BaseModel):
    """Represents a complete run of an agent with transcripts and metadata.

    An AgentRun encapsulates the execution of an agent, storing all communication
    transcripts and associated metadata. It must contain at least one transcript.

    Attributes:
        id: Unique identifier for the agent run, auto-generated by default.
        name: Optional human-readable name for the agent run.
        description: Optional description of the agent run.
        transcripts: List of Transcript objects.
        transcript_groups: List of TranscriptGroup objects.
        metadata: Additional structured metadata about the agent run as a JSON-serializable dictionary.
    """

    # Auto-generated UUID4 string unless the caller supplies one.
    id: str = Field(default_factory=lambda: str(uuid4()))
    name: str | None = None
    description: str | None = None

    # Field validators below accept the legacy dict[str, ...] input shape and
    # coerce it to a list, logging a deprecation warning.
    transcripts: list[Transcript]
    transcript_groups: list[TranscriptGroup] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
53
|
+
|
|
54
|
+
@field_validator("transcripts", mode="before")
|
|
55
|
+
@classmethod
|
|
56
|
+
def _validate_transcripts_type(cls, v: Any) -> Any:
|
|
57
|
+
if isinstance(v, dict):
|
|
58
|
+
logger.warning(
|
|
59
|
+
"dict[str, Transcript] for transcripts is deprecated. Use list[Transcript] instead."
|
|
60
|
+
)
|
|
61
|
+
v = cast(dict[str, Transcript], v)
|
|
62
|
+
return [Transcript.model_validate(t) for t in v.values()]
|
|
63
|
+
return v
|
|
64
|
+
|
|
65
|
+
@field_validator("transcript_groups", mode="before")
|
|
66
|
+
@classmethod
|
|
67
|
+
def _validate_transcript_groups_type(cls, v: Any) -> Any:
|
|
68
|
+
if isinstance(v, dict):
|
|
69
|
+
logger.warning(
|
|
70
|
+
"dict[str, TranscriptGroup] for transcript_groups is deprecated. Use list[TranscriptGroup] instead."
|
|
71
|
+
)
|
|
72
|
+
v = cast(dict[str, TranscriptGroup], v)
|
|
73
|
+
return [TranscriptGroup.model_validate(tg) for tg in v.values()]
|
|
74
|
+
return v
|
|
75
|
+
|
|
76
|
+
@model_validator(mode="after")
|
|
77
|
+
def _validate_transcripts_not_empty(self):
|
|
78
|
+
"""Validates that the agent run contains at least one transcript.
|
|
79
|
+
|
|
80
|
+
Raises:
|
|
81
|
+
ValueError: If the transcripts list is empty.
|
|
82
|
+
|
|
83
|
+
Returns:
|
|
84
|
+
AgentRun: The validated AgentRun instance.
|
|
85
|
+
"""
|
|
86
|
+
if len(self.transcripts) == 0:
|
|
87
|
+
raise ValueError("AgentRun must have at least one transcript")
|
|
88
|
+
return self
|
|
89
|
+
|
|
90
|
+
    def get_filterable_fields(self, max_depth: int = 1) -> list[FilterableField]:
        """Returns a list of all fields that can be used to filter the agent run,
        by recursively exploring the model_dump() for singleton types in dictionaries.

        Args:
            max_depth: Maximum nesting depth of metadata dicts to explore; the top
                level of `self.metadata` is depth 0.

        Returns:
            list[FilterableField]: A list of filterable fields, where each field is a
                dictionary containing its 'name' (path) and 'type'.
        """

        result: list[FilterableField] = []

        def _explore_dict(d: dict[str, Any], prefix: str, depth: int):
            # Collect scalar-valued keys as dotted paths; recurse into nested dicts.
            nonlocal result

            if depth > max_depth:
                return

            for k, v in d.items():
                if isinstance(v, (str, int, float, bool)):
                    result.append(
                        {
                            # type(v).__name__ yields exactly "str"/"int"/"float"/"bool",
                            # matching the FilterableField literal.
                            "name": f"{prefix}.{k}",
                            "type": cast(Literal["str", "bool", "int", "float"], type(v).__name__),
                        }
                    )
                elif isinstance(v, dict):
                    _explore_dict(cast(dict[str, Any], v), f"{prefix}.{k}", depth + 1)

        # Look at the agent run metadata
        _explore_dict(to_jsonable_python(self.metadata), "metadata", 0)
        # Look at the transcript metadata
        # TODO(mengk): restore this later when we have the ability to integrate with SQL.
        # for t_id, t in self.transcripts.items():
        #     _explore_dict(
        #         t.metadata.model_dump(strip_internal_fields=True), f"transcript.{t_id}.metadata", 0
        #     )

        # Append the text field
        result.append({"name": "agent_run_id", "type": "str"})
        result.append({"name": "text", "type": "str"})

        return result
|
|
132
|
+
|
|
133
|
+
######################
|
|
134
|
+
# Converting to text #
|
|
135
|
+
######################
|
|
136
|
+
|
|
137
|
+
    def _to_text_impl(self, token_limit: int = sys.maxsize) -> list[str]:
        """
        Core implementation for converting agent run to text representation.

        Args:
            token_limit: Maximum tokens per returned string under the GPT-4 tokenization scheme.

        Returns:
            List of strings, each at most token_limit tokens
        """
        # Render every transcript in full; token budgeting happens afterwards.
        transcript_strs: list[str] = []
        for i, t in enumerate(self.transcripts):
            transcript_content = t.to_str(
                token_limit=sys.maxsize,
                transcript_idx=i,
            )[0]
            transcript_strs.append(f"<transcript>\n{transcript_content}\n</transcript>")

        transcripts_str = "\n\n".join(transcript_strs)

        # Gather metadata; name/description are folded into the metadata object.
        metadata_obj = to_jsonable_python(self.metadata)
        if self.name is not None:
            metadata_obj["name"] = self.name
        if self.description is not None:
            metadata_obj["description"] = self.description

        # Infinite width disables YAML line wrapping.
        yaml_width = float("inf")
        transcripts_str = (
            f"Here is a complete agent run for analysis purposes only:\n{transcripts_str}\n\n"
        )
        metadata_str = f"Metadata about the complete agent run:\n<agent run metadata>\n{yaml.dump(metadata_obj, width=yaml_width)}\n</agent run metadata>"

        # No limit requested: skip token counting entirely.
        if token_limit == sys.maxsize:
            return [f"{transcripts_str}" f"{metadata_str}"]

        # Compute message length; if fits, return the full transcript and metadata
        transcript_str_tokens = get_token_count(transcripts_str)
        metadata_str_tokens = get_token_count(metadata_str)
        if transcript_str_tokens + metadata_str_tokens <= token_limit:
            return [f"{transcripts_str}" f"{metadata_str}"]

        # Otherwise, split up the transcript and metadata into chunks
        else:
            results: list[str] = []
            transcript_token_counts = [get_token_count(t) for t in transcript_strs]
            # The -50 reserves headroom for the wrapper text added to each chunk below.
            ranges = group_messages_into_ranges(
                transcript_token_counts, metadata_str_tokens, token_limit - 50
            )
            for msg_range in ranges:
                if msg_range.include_metadata:
                    # This range of transcripts plus the metadata fits in one chunk.
                    cur_transcript_str = "\n\n".join(
                        transcript_strs[msg_range.start : msg_range.end]
                    )
                    results.append(
                        f"Here is a partial agent run for analysis purposes only:\n{cur_transcript_str}"
                        f"{metadata_str}"
                    )
                else:
                    assert (
                        msg_range.end == msg_range.start + 1
                    ), "Ranges without metadata should be a single message"
                    t = self.transcripts[msg_range.start]
                    if msg_range.num_tokens < token_limit - 50:
                        # Single transcript fits whole; emit it as one chunk.
                        transcript = (
                            f"<transcript>\n{t.to_str(token_limit=sys.maxsize)[0]}\n</transcript>"
                        )
                        result = (
                            f"Here is a partial agent run for analysis purposes only:\n{transcript}"
                        )
                        results.append(result)
                    else:
                        # Single transcript exceeds the budget; let Transcript.to_str
                        # split it into fragments, one chunk per fragment.
                        transcript_fragments: list[str] = t.to_str(
                            token_limit=token_limit - 50,
                        )
                        for fragment in transcript_fragments:
                            result = f"<transcript>\n{fragment}\n</transcript>"
                            result = (
                                f"Here is a partial agent run for analysis purposes only:\n{result}"
                            )
                            results.append(result)
            return results
|
|
221
|
+
|
|
222
|
+
@property
|
|
223
|
+
def text(self) -> str:
|
|
224
|
+
"""Concatenates all transcript texts with double newlines as separators.
|
|
225
|
+
|
|
226
|
+
Returns:
|
|
227
|
+
str: A string representation of all transcripts.
|
|
228
|
+
"""
|
|
229
|
+
return self._to_text_impl(token_limit=sys.maxsize)[0]
|
|
230
|
+
|
|
231
|
+
##############################
|
|
232
|
+
# New text rendering methods #
|
|
233
|
+
##############################
|
|
234
|
+
|
|
235
|
+
    # Transcript ID -> Transcript (lazy; built by the transcript_dict property)
    _transcript_dict: dict[str, Transcript] | None = PrivateAttr(default=None)
    # Transcript Group ID -> Transcript Group (lazy; built by transcript_group_dict)
    _transcript_group_dict: dict[str, TranscriptGroup] | None = PrivateAttr(default=None)
    # Canonical tree cache keyed by full_tree flag: parent group id (or the
    # "__global_root" sentinel) -> ordered list of ("t" | "tg", id) children.
    _canonical_tree_cache: dict[bool, dict[str | None, list[tuple[Literal["t", "tg"], str]]]] = (
        PrivateAttr(default_factory=dict)
    )
    # Transcript IDs (depth-first) cache keyed by full_tree flag
    _transcript_ids_ordered_cache: dict[bool, list[str]] = PrivateAttr(default_factory=dict)
|
|
245
|
+
|
|
246
|
+
@property
|
|
247
|
+
def transcript_dict(self) -> dict[str, Transcript]:
|
|
248
|
+
"""Lazily compute and cache a mapping from transcript ID to Transcript."""
|
|
249
|
+
if self._transcript_dict is None:
|
|
250
|
+
self._transcript_dict = {t.id: t for t in self.transcripts}
|
|
251
|
+
return self._transcript_dict
|
|
252
|
+
|
|
253
|
+
@property
|
|
254
|
+
def transcript_group_dict(self) -> dict[str, TranscriptGroup]:
|
|
255
|
+
"""Lazily compute and cache a mapping from transcript group ID to TranscriptGroup."""
|
|
256
|
+
if self._transcript_group_dict is None:
|
|
257
|
+
self._transcript_group_dict = {tg.id: tg for tg in self.transcript_groups}
|
|
258
|
+
return self._transcript_group_dict
|
|
259
|
+
|
|
260
|
+
    def _invalidate_caches(self) -> None:
        """Reset cached lookups after mutating transcripts or transcript groups."""
        # Drop the lazy id->object dicts and both tree caches so every derived
        # view is rebuilt on next access.
        self._transcript_dict = None
        self._transcript_group_dict = None
        self._canonical_tree_cache.clear()
        self._transcript_ids_ordered_cache.clear()
|
|
266
|
+
|
|
267
|
+
def get_canonical_tree(
|
|
268
|
+
self, full_tree: bool = False
|
|
269
|
+
) -> dict[str | None, list[tuple[Literal["t", "tg"], str]]]:
|
|
270
|
+
"""Compute and cache the canonical, sorted transcript group tree.
|
|
271
|
+
|
|
272
|
+
Args:
|
|
273
|
+
full_tree: If True, include all transcript groups regardless of whether
|
|
274
|
+
they contain transcripts. If False, include only the minimal tree
|
|
275
|
+
that connects relevant groups and transcripts.
|
|
276
|
+
|
|
277
|
+
Returns:
|
|
278
|
+
Canonical tree mapping parent group id (or "__global_root") to a list of
|
|
279
|
+
children (type, id) tuples sorted by creation time.
|
|
280
|
+
"""
|
|
281
|
+
if (
|
|
282
|
+
full_tree not in self._canonical_tree_cache
|
|
283
|
+
or full_tree not in self._transcript_ids_ordered_cache
|
|
284
|
+
):
|
|
285
|
+
canonical_tree, transcript_idx_map = self._build_canonical_tree(full_tree=full_tree)
|
|
286
|
+
self._canonical_tree_cache[full_tree] = canonical_tree
|
|
287
|
+
self._transcript_ids_ordered_cache[full_tree] = list(transcript_idx_map.keys())
|
|
288
|
+
return self._canonical_tree_cache[full_tree]
|
|
289
|
+
|
|
290
|
+
def get_transcript_ids_ordered(self, full_tree: bool = False) -> list[str]:
|
|
291
|
+
"""Compute and cache the depth-first transcript id ordering.
|
|
292
|
+
|
|
293
|
+
Args:
|
|
294
|
+
full_tree: Whether to compute based on the full tree or the minimal tree.
|
|
295
|
+
|
|
296
|
+
Returns:
|
|
297
|
+
List of transcript ids in depth-first order.
|
|
298
|
+
"""
|
|
299
|
+
if (
|
|
300
|
+
full_tree not in self._transcript_ids_ordered_cache
|
|
301
|
+
or full_tree not in self._canonical_tree_cache
|
|
302
|
+
):
|
|
303
|
+
canonical_tree, transcript_idx_map = self._build_canonical_tree(full_tree=full_tree)
|
|
304
|
+
self._canonical_tree_cache[full_tree] = canonical_tree
|
|
305
|
+
self._transcript_ids_ordered_cache[full_tree] = list(transcript_idx_map.keys())
|
|
306
|
+
return self._transcript_ids_ordered_cache[full_tree]
|
|
307
|
+
|
|
308
|
+
    def _build_canonical_tree(self, full_tree: bool = False):
        """Build the canonical child tree and the depth-first transcript index map.

        Args:
            full_tree: If True, include every transcript group; if False, build only
                the minimal tree connecting groups that (transitively) contain
                transcripts to the root.

        Returns:
            Tuple of (canonical tree keyed by parent id or "__global_root",
            transcript id -> depth-first index map).
        """
        t_dict = self.transcript_dict
        tg_dict = self.transcript_group_dict

        # Find all transcript groups that have direct transcript children
        # Also keep track of transcripts that are not in a group
        tgs_to_transcripts: dict[str, set[str]] = {}
        for transcript in t_dict.values():
            if transcript.transcript_group_id is None:
                tgs_to_transcripts.setdefault("__global_root", set()).add(transcript.id)
            else:
                tgs_to_transcripts.setdefault(transcript.transcript_group_id, set()).add(
                    transcript.id
                )

        # tg_tree maps from parent -> children. A child can be a group or a transcript.
        # A parent must be a group (or None, for transcripts that are not in a group).
        tg_tree: dict[str, set[tuple[Literal["t", "tg"], str]]] = {}

        if full_tree:
            # Include every group under its parent (or the global root), plus all
            # direct transcript children.
            for tg_id, tg in tg_dict.items():
                tg_tree.setdefault(tg.parent_transcript_group_id or "__global_root", set()).add(
                    ("tg", tg_id)
                )
                for t_id in tgs_to_transcripts.get(tg_id, []):
                    tg_tree.setdefault(tg_id, set()).add(("t", t_id))
            for t_id, t in t_dict.items():
                tg_tree.setdefault(t.transcript_group_id or "__global_root", set()).add(("t", t_id))
        else:
            # Initialize q with "important" tgs
            q, seen = deque(tgs_to_transcripts.keys()), set(tgs_to_transcripts.keys())

            # Do an "upwards BFS" from leaves up to the root. Builds a tree of only relevant nodes.
            while q:
                u_id = q.popleft()
                u = tg_dict.get(u_id)  # None if __global_root

                # Add the transcripts under this tg
                for t_id in tgs_to_transcripts.get(u_id, []):
                    tg_tree.setdefault(u_id, set()).add(("t", t_id))

                # Add an edge from the parent
                if u is not None:
                    par_id = u.parent_transcript_group_id or "__global_root"
                    # Mark u as a child of par
                    tg_tree.setdefault(par_id, set()).add(("tg", u_id))
                    # If we haven't investigated the parent before, add to q
                    if par_id not in seen:
                        q.append(par_id)
                        seen.add(par_id)

        # For each node, sort by created_at timestamp

        def _cmp(element: tuple[Literal["t", "tg"], str]) -> datetime:
            # Missing created_at sorts last via the datetime.max fallback.
            # NOTE(review): assumes created_at values are all naive (or all aware)
            # datetimes — mixing them would make sorted() raise; confirm upstream.
            obj_type, obj_id = element
            if obj_type == "tg":
                return tg_dict[obj_id].created_at or datetime.max
            else:
                return t_dict[obj_id].created_at or datetime.max

        c_tree: dict[str | None, list[tuple[Literal["t", "tg"], str]]] = {}
        for tg_id in tg_tree:
            children_ids = list(set(tg_tree[tg_id]))
            sorted_children_ids = sorted(children_ids, key=_cmp)
            c_tree[tg_id] = sorted_children_ids

        # Compute transcript indices as the depth-first traversal index
        transcript_idx_map: dict[str, int] = {}

        def _assign_transcript_indices(cur_tg_id: str, next_idx: int) -> int:
            # Depth-first walk; transcripts get consecutive indices in visit order.
            children = c_tree.get(cur_tg_id, [])
            for child_type, child_id in children:
                if child_type == "tg":
                    next_idx = _assign_transcript_indices(child_id, next_idx)
                else:
                    transcript_idx_map[child_id] = next_idx
                    next_idx += 1
            return next_idx

        _assign_transcript_indices("__global_root", 0)

        return c_tree, transcript_idx_map
|
|
390
|
+
|
|
391
|
+
def delete_transcript_group_subtree(self, transcript_group_id: str) -> None:
|
|
392
|
+
"""Delete a transcript group and all descendant groups/transcripts using the canonical tree."""
|
|
393
|
+
if transcript_group_id == "__global_root":
|
|
394
|
+
raise ValueError("Cannot delete the global root sentinel")
|
|
395
|
+
if transcript_group_id not in self.transcript_group_dict:
|
|
396
|
+
raise ValueError(
|
|
397
|
+
f"Transcript group '{transcript_group_id}' does not exist on this run."
|
|
398
|
+
)
|
|
399
|
+
|
|
400
|
+
canonical_tree = self.get_canonical_tree(full_tree=True)
|
|
401
|
+
groups_to_delete: set[str] = set()
|
|
402
|
+
transcripts_to_delete: set[str] = set()
|
|
403
|
+
|
|
404
|
+
queue: deque[str] = deque([transcript_group_id])
|
|
405
|
+
while queue:
|
|
406
|
+
current_group = queue.popleft()
|
|
407
|
+
groups_to_delete.add(current_group)
|
|
408
|
+
for child_type, child_id in canonical_tree.get(current_group, []):
|
|
409
|
+
if child_type == "tg":
|
|
410
|
+
queue.append(child_id)
|
|
411
|
+
else:
|
|
412
|
+
transcripts_to_delete.add(child_id)
|
|
413
|
+
|
|
414
|
+
if groups_to_delete:
|
|
415
|
+
self.transcript_groups = [
|
|
416
|
+
tg for tg in self.transcript_groups if tg.id not in groups_to_delete
|
|
417
|
+
]
|
|
418
|
+
if transcripts_to_delete:
|
|
419
|
+
self.transcripts = [t for t in self.transcripts if t.id not in transcripts_to_delete]
|
|
420
|
+
|
|
421
|
+
self._invalidate_caches()
|
|
422
|
+
|
|
423
|
+
    def to_text_new(
        self,
        agent_run_alias: int | str = 0,
        t_idx_map: dict[str, int] | None = None,
        indent: int = 0,
        full_tree: bool = False,
    ):
        """Render the agent run as nested text by walking the canonical tree.

        Args:
            agent_run_alias: Alias used in the wrapper tags; an int becomes "R{n}".
            t_idx_map: Optional transcript id -> alias index map; defaults to the
                depth-first ordering for this tree.
            indent: Indentation level forwarded to transcript/group renderers and
                applied to the metadata section.
            full_tree: Whether to render the full or minimal canonical tree.

        Returns:
            The rendered string wrapped in <|agent run ...|> tags.
        """
        if isinstance(agent_run_alias, int):
            agent_run_alias = f"R{agent_run_alias}"

        c_tree = self.get_canonical_tree(full_tree=full_tree)
        t_ids_ordered = self.get_transcript_ids_ordered(full_tree=full_tree)
        if t_idx_map is None:
            t_idx_map = {t_id: i for i, t_id in enumerate(t_ids_ordered)}
        t_dict = self.transcript_dict
        tg_dict = self.transcript_group_dict

        # Traverse the tree and render the string
        def _recurse(tg_id: str) -> str:
            children_ids = c_tree.get(tg_id, [])
            children_texts: list[str] = []
            for child_type, child_id in children_ids:
                if child_type == "tg":
                    children_texts.append(_recurse(child_id))
                else:
                    cur_text = t_dict[child_id].to_text_new(
                        transcript_alias=t_idx_map[child_id],
                        indent=indent,
                    )
                    children_texts.append(cur_text)
            children_text = "\n".join(children_texts)

            # No wrapper for global root
            if tg_id == "__global_root":
                return children_text
            # Delegate rendering to TranscriptGroup
            else:
                tg = tg_dict[tg_id]
                return tg.to_text_new(children_text=children_text, indent=indent)

        text = _recurse("__global_root")

        # Append agent run metadata below the full content
        metadata_text = dump_metadata(self.metadata)
        if metadata_text is not None:
            if indent > 0:
                metadata_text = textwrap.indent(metadata_text, " " * indent)
            # Metadata section gets its own alias derived from the run alias.
            metadata_alias = f"{agent_run_alias}M"
            text += f"\n<|agent run metadata {metadata_alias}|>\n{metadata_text}\n</|agent run metadata {metadata_alias}|>"

        return f"<|agent run {agent_run_alias}|>\n{text}\n</|agent run {agent_run_alias}|>\n"
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from docent.data_models.chat.content import Content, ContentReasoning, ContentText
|
|
2
|
+
from docent.data_models.chat.message import (
|
|
3
|
+
AssistantMessage,
|
|
4
|
+
ChatMessage,
|
|
5
|
+
DocentAssistantMessage,
|
|
6
|
+
DocentChatMessage,
|
|
7
|
+
SystemMessage,
|
|
8
|
+
ToolMessage,
|
|
9
|
+
UserMessage,
|
|
10
|
+
parse_chat_message,
|
|
11
|
+
parse_docent_chat_message,
|
|
12
|
+
)
|
|
13
|
+
from docent.data_models.chat.tool import (
|
|
14
|
+
ToolCall,
|
|
15
|
+
ToolCallContent,
|
|
16
|
+
ToolInfo,
|
|
17
|
+
ToolParams,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Public API of docent.data_models.chat, re-exported from the submodules above.
__all__ = [
    "ChatMessage",
    "DocentChatMessage",
    "AssistantMessage",
    "DocentAssistantMessage",
    "SystemMessage",
    "ToolMessage",
    "UserMessage",
    "Content",
    "ContentReasoning",
    "ContentText",
    "ToolCall",
    "ToolCallContent",
    "ToolInfo",
    "ToolParams",
    "parse_chat_message",
    "parse_docent_chat_message",
]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from typing import Annotated, Literal
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Discriminator
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BaseContent(BaseModel):
    """Base class for all content types in chat messages.

    Provides the foundation for different content types with a discriminator field.

    Attributes:
        type: The content type identifier, used for discriminating between content types.
    """

    # Full set of declared discriminator values; only "text" and "reasoning"
    # have concrete subclasses in this module.
    type: Literal["text", "reasoning", "image", "audio", "video"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ContentText(BaseContent):
    """Text content for chat messages.

    Represents plain text content in a chat message.

    Attributes:
        type: Fixed as "text" to identify this content type.
        text: The actual text content.
        refusal: Optional flag indicating if this is a refusal message.
    """

    # Narrows the base Literal to the single "text" value (hence the ignore).
    type: Literal["text"] = "text"  # type: ignore
    text: str
    refusal: bool | None = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ContentReasoning(BaseContent):
    """Reasoning content for chat messages.

    Represents reasoning or thought process content in a chat message.

    Attributes:
        type: Fixed as "reasoning" to identify this content type.
        reasoning: The actual reasoning text.
        signature: Optional signature associated with the reasoning.
        redacted: Flag indicating if the reasoning has been redacted.
    """

    # Narrows the base Literal to the single "reasoning" value (hence the ignore).
    type: Literal["reasoning"] = "reasoning"  # type: ignore
    reasoning: str
    signature: str | None = None
    redacted: bool = False
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Content type discriminated union
Content = Annotated[ContentText | ContentReasoning, Discriminator("type")]
"""Discriminated union of possible content types using the 'type' field.
Can be either ContentText or ContentReasoning.
"""
|