PyPI - docent-python - Versions diffs - 0.1.0a1__py3-none-any.whl - Mend

docent-python 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docent-python might be problematic. Click here for more details.

Files changed (23)

docent/__init__.py +3 -0
docent/_log_util/__init__.py +3 -0
docent/_log_util/logger.py +141 -0
docent/data_models/__init__.py +25 -0
docent/data_models/_tiktoken_util.py +91 -0
docent/data_models/agent_run.py +231 -0
docent/data_models/chat/__init__.py +25 -0
docent/data_models/chat/content.py +56 -0
docent/data_models/chat/message.py +125 -0
docent/data_models/chat/tool.py +109 -0
docent/data_models/citation.py +223 -0
docent/data_models/filters.py +205 -0
docent/data_models/metadata.py +219 -0
docent/data_models/regex.py +56 -0
docent/data_models/shared_types.py +10 -0
docent/data_models/transcript.py +347 -0
docent/py.typed +0 -0
docent/sdk/__init__.py +0 -0
docent/sdk/client.py +285 -0
docent_python-0.1.0a1.dist-info/METADATA +16 -0
docent_python-0.1.0a1.dist-info/RECORD +23 -0
docent_python-0.1.0a1.dist-info/WHEEL +4 -0
docent_python-0.1.0a1.dist-info/licenses/LICENSE.md +7 -0

docent/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+__all__ = ["Docent"]
+from docent.sdk.client import Docent

docent/_log_util/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+__all__ = ["get_logger"]
+from docent._log_util.logger import get_logger

docent/_log_util/logger.py ADDED Viewed

@@ -0,0 +1,141 @@
+import logging
+import sys
+from dataclasses import dataclass
+from typing import Any, Dict, Literal, MutableMapping, Optional, Tuple
+@dataclass
+class ColorCode:
+    """Pair of ANSI escape sequences describing how a piece of text is rendered.
+    Attributes:
+        fore: Escape sequence selecting the foreground color.
+        style: Optional escape sequence for an additional style (e.g. bold); empty by default.
+    """
+    fore: str
+    style: str = ""
+class Colors:
+    """Namespace of ANSI color/style codes plus a helper that wraps text in them."""
+    # Foreground colors
+    BLACK = ColorCode("\033[30m")
+    RED = ColorCode("\033[31m")
+    GREEN = ColorCode("\033[32m")
+    YELLOW = ColorCode("\033[33m")
+    BLUE = ColorCode("\033[34m")
+    MAGENTA = ColorCode("\033[35m")
+    CYAN = ColorCode("\033[36m")
+    WHITE = ColorCode("\033[37m")
+    BRIGHT_MAGENTA = ColorCode("\033[95m")
+    BRIGHT_CYAN = ColorCode("\033[96m")
+    # Styles
+    BOLD = "\033[1m"
+    RESET = "\033[0m"
+    @staticmethod
+    def apply(text: str, color: ColorCode) -> str:
+        """Return ``text`` preceded by the color's style and foreground codes and followed by RESET."""
+        return "{}{}{}{}".format(color.style, color.fore, text, Colors.RESET)
+class ColoredFormatter(logging.Formatter):
+    """Formatter that colors the level name and namespace and can highlight whole messages.
+    ``format`` decorates a copy of the record, so the record seen by any other handler (for
+    example handlers on ancestor loggers reached through propagation) keeps its plain level
+    name, message and args, and formatting the same record twice does not stack escape codes.
+    """
+    COLORS: Dict[int, ColorCode] = {
+        logging.DEBUG: Colors.BLUE,
+        logging.INFO: Colors.GREEN,
+        logging.WARNING: Colors.YELLOW,
+        logging.ERROR: Colors.RED,
+        logging.CRITICAL: ColorCode("\033[31m", Colors.BOLD),
+    }
+    # Available highlight colors; names not listed here fall back to magenta in format()
+    HIGHLIGHT_COLORS: Dict[str, ColorCode] = {
+        "magenta": ColorCode(Colors.BRIGHT_MAGENTA.fore, Colors.BOLD),
+        "cyan": ColorCode(Colors.BRIGHT_CYAN.fore, Colors.BOLD),
+        "yellow": ColorCode(Colors.YELLOW.fore, Colors.BOLD),
+        "red": ColorCode(Colors.RED.fore, Colors.BOLD),
+        "green": ColorCode(Colors.GREEN.fore, Colors.BOLD),
+    }
+    def __init__(self, fmt: Optional[str] = None) -> None:
+        """Create the formatter.
+        Args:
+            fmt: Log format string; when omitted, "time [level] namespace: message" is used.
+        """
+        super().__init__(
+            fmt or "%(asctime)s [%(levelname)s] %(namespace)s: %(message)s", datefmt="%H:%M:%S"
+        )
+    def format(self, record: logging.LogRecord) -> str:
+        """Return the formatted, ANSI-colored text for ``record``.
+        Args:
+            record: The log record to render. It is not modified.
+        Returns:
+            The formatted line with colored level name and namespace, and with the whole
+            message colored when the record carries a truthy ``highlight`` attribute.
+        """
+        # Decorate a shallow copy so the caller's record stays untouched
+        record = logging.makeLogRecord(record.__dict__)
+        # Add namespace to extra fields if not present
+        if not getattr(record, "namespace", None):
+            record.__dict__["namespace"] = record.name
+        # Color the level name; custom level numbers without an entry stay uncolored
+        level_color = self.COLORS.get(record.levelno)
+        if level_color is not None:
+            record.levelname = Colors.apply(record.levelname, level_color)
+        # Color the namespace
+        record.__dict__["namespace"] = Colors.apply(record.__dict__["namespace"], Colors.CYAN)
+        # Check if highlight flag is set
+        highlight = getattr(record, "highlight", None)
+        if highlight:
+            # Get the highlight color or default to magenta
+            color_name = highlight if isinstance(highlight, str) else "magenta"
+            highlight_color = self.HIGHLIGHT_COLORS.get(
+                color_name, self.HIGHLIGHT_COLORS["magenta"]
+            )
+            # Render the message first, then color the finished text; args are cleared so the
+            # base class does not try to %-format the already rendered message again
+            original_message = record.getMessage()
+            record.msg = Colors.apply(original_message, highlight_color)
+            if record.args:
+                record.args = ()
+        return super().format(record)
+class LoggerAdapter(logging.LoggerAdapter[logging.Logger]):
+    """
+    Logger adapter that allows highlighting specific log messages.
+    """
+    def process(
+        self, msg: Any, kwargs: MutableMapping[str, Any]
+    ) -> Tuple[Any, MutableMapping[str, Any]]:
+        """Return ``msg`` and ``kwargs`` unchanged.
+        Keeping ``kwargs`` as-is lets a per-call ``extra`` (which carries the highlight flag)
+        reach the log record.
+        """
+        # Pass highlight flag through to the record
+        return msg, kwargs
+    def highlight(
+        self,
+        msg: object,
+        *args: Any,
+        color: Literal["magenta", "cyan", "yellow", "red", "green"] = "magenta",
+        **kwargs: Any,
+    ) -> None:
+        """
+        Log a highlighted message at INFO level.
+        Args:
+            msg: The message format string
+            color: The color to highlight with (magenta, cyan, yellow, red, green); stored on
+                the record as its ``highlight`` attribute
+            *args: The args for the message format string
+            **kwargs: Additional logging kwargs; a supplied ``extra`` mapping is copied, never
+                modified
+        """
+        # Copy instead of mutating: the caller may reuse the mapping it passed as ``extra``.
+        # A missing or None ``extra`` simply starts from an empty dict.
+        caller_extra = kwargs.get("extra")
+        merged_extra: Dict[str, Any] = dict(caller_extra) if caller_extra else {}
+        merged_extra["highlight"] = color
+        kwargs["extra"] = merged_extra
+        return self.info(msg, *args, **kwargs)
+def get_logger(namespace: str) -> LoggerAdapter:
+    """
+    Return a highlighting-capable adapter around the logger named ``namespace``.
+    If that logger has no handlers yet, a stdout handler using ColoredFormatter is attached and
+    the level is set to INFO; a logger that already has handlers is left as it is.
+    Args:
+        namespace: The namespace (logger name) to fetch
+    Returns:
+        A LoggerAdapter wrapping the logger, with highlighting support
+    """
+    base_logger = logging.getLogger(namespace)
+    if base_logger.handlers:
+        # Already has a handler: do not add another one or touch the level
+        return LoggerAdapter(base_logger, {})
+    stdout_handler = logging.StreamHandler(sys.stdout)
+    stdout_handler.setFormatter(ColoredFormatter())
+    base_logger.addHandler(stdout_handler)
+    # Default level for freshly configured loggers
+    base_logger.setLevel(logging.INFO)
+    return LoggerAdapter(base_logger, {})

docent/data_models/__init__.py ADDED Viewed

@@ -0,0 +1,25 @@
+from docent.data_models.agent_run import AgentRun
+from docent.data_models.citation import Citation
+from docent.data_models.filters import (
+    AgentRunIdFilter,
+    BaseFrameFilter,
+    ComplexFilter,
+    SearchResultPredicateFilter,
+)
+from docent.data_models.metadata import BaseAgentRunMetadata, BaseMetadata, FrameDimension
+from docent.data_models.regex import RegexSnippet
+from docent.data_models.transcript import Transcript
+__all__ = [
+    "AgentRun",
+    "Citation",
+    "RegexSnippet",
+    "AgentRunIdFilter",
+    "FrameDimension",
+    "BaseFrameFilter",
+    "SearchResultPredicateFilter",
+    "ComplexFilter",
+    "BaseAgentRunMetadata",
+    "BaseMetadata",
+    "Transcript",
+]

docent/data_models/_tiktoken_util.py ADDED Viewed

@@ -0,0 +1,91 @@
+import tiktoken
+# Token budget constant. Nothing in this module reads it; presumably consumed by importers — TODO confirm
+MAX_TOKENS = 100_000
+def get_token_count(text: str, model: str = "gpt-4") -> int:
+    """Return how many tokens ``text`` encodes to with the tiktoken encoding for ``model`` (GPT-4 by default)."""
+    token_ids = tiktoken.encoding_for_model(model).encode(text)
+    return len(token_ids)
+def truncate_to_token_limit(text: str, max_tokens: int, model: str = "gpt-4") -> str:
+    """Truncate text to stay within the specified token limit.
+    Args:
+        text: The text to truncate.
+        max_tokens: Maximum number of tokens to keep; zero or a negative value keeps nothing.
+        model: Model name used to pick the tiktoken encoding.
+    Returns:
+        ``text`` unchanged when it already fits, otherwise the decoding of its first
+        ``max_tokens`` tokens.
+    """
+    encoding = tiktoken.encoding_for_model(model)
+    tokens = encoding.encode(text)
+    # Clamp at zero: a negative bound in the slice below would drop tokens from the END of the
+    # text instead of keeping none
+    limit = max(max_tokens, 0)
+    if len(tokens) <= limit:
+        return text
+    return encoding.decode(tokens[:limit])
+class MessageRange:
+    """Half-open span ``[start, end)`` of message indices in a transcript, with token bookkeeping."""
+    start: int
+    end: int
+    include_metadata: bool
+    num_tokens: int
+    def __init__(self, start: int, end: int, include_metadata: bool, num_tokens: int):
+        # start is inclusive, end is exclusive
+        self.start, self.end = start, end
+        self.include_metadata, self.num_tokens = include_metadata, num_tokens
+def group_messages_into_ranges(
+    token_counts: list[int], metadata_tokens: int, max_tokens: int, margin: int = 50
+) -> list[MessageRange]:
+    """Split a list of messages + metadata into ranges that stay within the specified token limit.
+    Always tries to create ranges with metadata included, unless a single message + metadata is too long,
+    in which case you get a lone message with no metadata.
+    Args:
+        token_counts: Token count of each message, in order.
+        metadata_tokens: Token count of the metadata added to every metadata-bearing range.
+        max_tokens: Token limit per range.
+        margin: Safety margin subtracted from ``max_tokens`` to get the usable budget.
+    Returns:
+        Contiguous, non-overlapping ranges that together cover every message. A range with
+        ``include_metadata=False`` always spans exactly one message.
+    """
+    budget = max_tokens - margin
+    ranges: list[MessageRange] = []
+    start_index = 0
+    running_token_count = 0
+    i = 0
+    while i < len(token_counts):
+        new_token_count = token_counts[i]
+        if running_token_count + new_token_count + metadata_tokens <= budget:
+            # message i still fits next to the metadata: keep accumulating
+            running_token_count += new_token_count
+            i += 1
+            continue
+        if start_index == i:  # a single message + metadata is already too long
+            ranges.append(
+                MessageRange(start=i, end=i + 1, include_metadata=False, num_tokens=new_token_count)
+            )
+            i += 1
+        else:
+            # close [start_index, i) with metadata included; i is not advanced, so message i is
+            # re-examined as the first message of the next range
+            ranges.append(
+                MessageRange(
+                    start=start_index,
+                    end=i,
+                    include_metadata=True,
+                    num_tokens=running_token_count + metadata_tokens,
+                )
+            )
+        running_token_count = 0
+        start_index = i
+    if start_index < len(token_counts):
+        # Every message accumulated here passed the budget check above with the metadata
+        # counted in, so the trailing range always carries metadata (also when it lands exactly
+        # on the budget, and also when its messages have zero tokens).
+        ranges.append(
+            MessageRange(
+                start=start_index,
+                end=len(token_counts),
+                include_metadata=True,
+                num_tokens=running_token_count + metadata_tokens,
+            )
+        )
+    return ranges

docent/data_models/agent_run.py ADDED Viewed

@@ -0,0 +1,231 @@
+import sys
+from typing import Any, Literal, TypedDict, cast
+from uuid import uuid4
+import yaml
+from pydantic import (
+    BaseModel,
+    Field,
+    field_serializer,
+    field_validator,
+    model_validator,
+)
+from docent.data_models._tiktoken_util import get_token_count, group_messages_into_ranges
+from docent.data_models.metadata import BaseAgentRunMetadata
+from docent.data_models.transcript import Transcript, TranscriptWithoutMetadataValidator
+class FilterableField(TypedDict):
+    """Descriptor of one field an agent run can be filtered on."""
+    # Dotted path of the field (e.g. "metadata.<key>"), or "text"
+    name: str
+    # Python type name of the field's value
+    type: Literal["str", "bool", "int", "float"]
+class AgentRun(BaseModel):
+    """Represents a complete run of an agent with transcripts and metadata.
+    An AgentRun encapsulates the execution of an agent, storing all communication
+    transcripts and associated metadata. It must contain at least one transcript.
+    Attributes:
+        id: Unique identifier for the agent run, auto-generated by default.
+        name: Optional human-readable name for the agent run.
+        description: Optional description of the agent run.
+        transcripts: Dict mapping transcript IDs to Transcript objects.
+        metadata: Additional structured metadata about the agent run.
+    """
+    id: str = Field(default_factory=lambda: str(uuid4()))
+    name: str | None = None
+    description: str | None = None
+    transcripts: dict[str, Transcript]
+    metadata: BaseAgentRunMetadata
+    @field_serializer("metadata")
+    def serialize_metadata(self, metadata: BaseAgentRunMetadata, _info: Any) -> dict[str, Any]:
+        """
+        Custom serializer for the metadata field so the internal fields are explicitly preserved.
+        """
+        return metadata.model_dump(strip_internal_fields=False)
+    @field_validator("metadata", mode="before")
+    @classmethod
+    def _validate_metadata_type(cls, v: Any) -> Any:
+        """Rejects metadata that is neither None nor a BaseAgentRunMetadata instance.
+        Raises:
+            ValueError: If ``v`` is of any other type (including a plain dict).
+        """
+        if v is not None and not isinstance(v, BaseAgentRunMetadata):
+            raise ValueError(
+                f"metadata must be an instance of BaseAgentRunMetadata, got {type(v).__name__}"
+            )
+        return v
+    @model_validator(mode="after")
+    def _validate_transcripts_not_empty(self):
+        """Validates that the agent run contains at least one transcript.
+        Raises:
+            ValueError: If the transcripts dict is empty.
+        Returns:
+            AgentRun: The validated AgentRun instance.
+        """
+        if not self.transcripts:
+            raise ValueError("AgentRun must have at least one transcript")
+        return self
+    def to_text(self, token_limit: int = sys.maxsize) -> list[str]:
+        """
+        Represents an agent run as a list of strings, each meant to stay within token_limit tokens
+        under the GPT-4 tokenization scheme.
+        We'll try to split up long AgentRuns along transcript boundaries and include metadata.
+        For very long transcripts, we'll have to split them up further and remove metadata.
+        Args:
+            token_limit: Token budget per returned string; the default (sys.maxsize) disables
+                splitting and token counting altogether.
+        Returns:
+            list[str]: A single "complete agent run" string when no limit is given or everything
+                fits, otherwise one "partial agent run" string per chunk.
+        """
+        transcript_items = list(self.transcripts.items())
+        transcript_strs: list[str] = [
+            f"<transcript {t_key}>\n{t.to_str(agent_run_idx=None, transcript_idx=i)}\n</transcript {t_key}>"
+            for i, (t_key, t) in enumerate(transcript_items)
+        ]
+        transcripts_str = "\n\n".join(transcript_strs)
+        # Gather metadata
+        metadata_obj = self.metadata.model_dump(strip_internal_fields=True)
+        if self.name is not None:
+            metadata_obj["name"] = self.name
+        if self.description is not None:
+            metadata_obj["description"] = self.description
+        # Add the field descriptions if they exist
+        metadata_obj = {
+            (f"{k} ({d})" if (d := self.metadata.get_field_description(k)) is not None else k): v
+            for k, v in metadata_obj.items()
+        }
+        # Infinite width keeps yaml.dump from wrapping long values across lines
+        yaml_width = float("inf")
+        transcripts_str = (
+            f"Here is a complete agent run for analysis purposes only:\n{transcripts_str}\n\n"
+        )
+        metadata_str = f"Metadata about the complete agent run:\n<agent run metadata>\n{yaml.dump(metadata_obj, width=yaml_width)}\n</agent run metadata>"
+        if token_limit == sys.maxsize:
+            return [f"{transcripts_str}{metadata_str}"]
+        # Compute message length; if fits, return the full transcript and metadata
+        transcript_str_tokens = get_token_count(transcripts_str)
+        metadata_str_tokens = get_token_count(metadata_str)
+        if transcript_str_tokens + metadata_str_tokens <= token_limit:
+            return [f"{transcripts_str}{metadata_str}"]
+        # Otherwise, split up the transcript and metadata into chunks
+        # TODO(vincent, mengk): does this code account for multiple transcripts correctly? a little confused.
+        # NOTE(review): group_messages_into_ranges subtracts its own default margin (50) on top of
+        # the 50 removed here, so the effective grouping budget is token_limit - 100 — confirm
+        # this is intended.
+        results: list[str] = []
+        transcript_token_counts = [get_token_count(t) for t in transcript_strs]
+        ranges = group_messages_into_ranges(
+            transcript_token_counts, metadata_str_tokens, token_limit - 50
+        )
+        for msg_range in ranges:
+            if msg_range.include_metadata:
+                cur_transcript_str = "\n\n".join(transcript_strs[msg_range.start : msg_range.end])
+                # Same blank-line separator before the metadata as in the complete-run text
+                results.append(
+                    f"Here is a partial agent run for analysis purposes only:\n{cur_transcript_str}\n\n"
+                    f"{metadata_str}"
+                )
+                continue
+            assert (
+                msg_range.end == msg_range.start + 1
+            ), "Ranges without metadata should be a single message"
+            t_id, t = transcript_items[msg_range.start]
+            if msg_range.num_tokens < token_limit - 50:
+                # Reuse the string that was token-counted above; it was rendered with the
+                # transcript's real index, which a fresh t.to_str() call would not carry
+                results.append(
+                    f"Here is a partial agent run for analysis purposes only:\n{transcript_strs[msg_range.start]}"
+                )
+            else:
+                # Too long even on its own: let the transcript split itself into fragments
+                for fragment in t.to_str_with_token_limit(token_limit - 50):
+                    result = f"<transcript {t_id}>\n{fragment}\n</transcript {t_id}>"
+                    results.append(
+                        f"Here is a partial agent run for analysis purposes only:\n{result}"
+                    )
+        return results
+    @property
+    def text(self) -> str:
+        """The whole agent run as one string: all transcripts followed by the run metadata.
+        Returns:
+            str: The single element produced by to_text() when no token limit is given.
+        """
+        return self.to_text()[0]
+    def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
+        """Extends the parent model_dump method to include the text property.
+        Args:
+            *args: Variable length argument list passed to parent method.
+            **kwargs: Arbitrary keyword arguments passed to parent method.
+        Returns:
+            dict[str, Any]: Dictionary representation of the model including the text property.
+        """
+        return super().model_dump(*args, **kwargs) | {"text": self.text}
+    def get_filterable_fields(self, max_depth: int = 1) -> list[FilterableField]:
+        """Returns a list of all fields that can be used to filter the agent run,
+        by recursively exploring the model_dump() for singleton types in dictionaries.
+        Args:
+            max_depth: Deepest nesting level of metadata dictionaries to explore; the top-level
+                metadata dictionary is depth 0.
+        Returns:
+            list[FilterableField]: A list of filterable fields, where each field is a
+                                   dictionary containing its 'name' (path) and 'type'.
+        """
+        result: list[FilterableField] = []
+        def _explore_dict(d: dict[str, Any], prefix: str, depth: int):
+            if depth > max_depth:
+                return
+            for k, v in d.items():
+                if isinstance(v, (str, int, float, bool)):
+                    result.append(
+                        {
+                            "name": f"{prefix}.{k}",
+                            "type": cast(Literal["str", "bool", "int", "float"], type(v).__name__),
+                        }
+                    )
+                elif isinstance(v, dict):
+                    _explore_dict(cast(dict[str, Any], v), f"{prefix}.{k}", depth + 1)
+        # Look at the agent run metadata
+        _explore_dict(self.metadata.model_dump(strip_internal_fields=True), "metadata", 0)
+        # Look at the transcript metadata
+        # TODO(mengk): restore this later when we have the ability to integrate with SQL.
+        # for t_id, t in self.transcripts.items():
+        #     _explore_dict(
+        #         t.metadata.model_dump(strip_internal_fields=True), f"transcript.{t_id}.metadata", 0
+        #     )
+        # Append the text field
+        result.append({"name": "text", "type": "str"})
+        return result
+class AgentRunWithoutMetadataValidator(AgentRun):
+    """
+    A version of AgentRun that skips the type check on metadata.
+    It overrides AgentRun's "before" field validator for ``metadata`` with a pass-through and
+    stores transcripts as TranscriptWithoutMetadataValidator.
+    Needed for sending/receiving agent runs via JSON, since they incorrectly trip the existing validator.
+    """
+    transcripts: dict[str, TranscriptWithoutMetadataValidator]  # type: ignore
+    @field_validator("metadata", mode="before")
+    @classmethod
+    def _validate_metadata_type(cls, v: Any) -> Any:
+        # Bypass the isinstance check done by AgentRun._validate_metadata_type
+        return v

docent/data_models/chat/__init__.py ADDED Viewed

@@ -0,0 +1,25 @@
+from docent.data_models.chat.content import Content, ContentReasoning, ContentText
+from docent.data_models.chat.message import (
+    AssistantMessage,
+    ChatMessage,
+    SystemMessage,
+    ToolMessage,
+    UserMessage,
+    parse_chat_message,
+)
+from docent.data_models.chat.tool import ToolCall, ToolInfo, ToolParams
+__all__ = [
+    "ChatMessage",
+    "AssistantMessage",
+    "SystemMessage",
+    "ToolMessage",
+    "UserMessage",
+    "Content",
+    "ContentReasoning",
+    "ContentText",
+    "ToolCall",
+    "ToolInfo",
+    "ToolParams",
+    "parse_chat_message",
+]

docent/data_models/chat/content.py ADDED Viewed

@@ -0,0 +1,56 @@
+from typing import Annotated, Literal
+from pydantic import BaseModel, Discriminator
+class BaseContent(BaseModel):
+    """Base class for all content types in chat messages.
+    Provides the foundation for different content types with a discriminator field.
+    Attributes:
+        type: The content type identifier, used for discriminating between content types.
+    """
+    # Only "text" and "reasoning" have concrete subclasses in this module; the other values are
+    # accepted by the annotation but have no model here
+    type: Literal["text", "reasoning", "image", "audio", "video"]
+class ContentText(BaseContent):
+    """Text content for chat messages.
+    Represents plain text content in a chat message.
+    Attributes:
+        type: Fixed as "text" to identify this content type.
+        text: The actual text content.
+        refusal: Optional flag indicating if this is a refusal message; None when unspecified.
+    """
+    # Narrows the base class's Literal to a single value (presumably why the type-checker
+    # ignore is needed)
+    type: Literal["text"] = "text"  # type: ignore
+    text: str
+    refusal: bool | None = None
+class ContentReasoning(BaseContent):
+    """Reasoning content for chat messages.
+    Represents reasoning or thought process content in a chat message.
+    Attributes:
+        type: Fixed as "reasoning" to identify this content type.
+        reasoning: The actual reasoning text.
+        signature: Optional signature associated with the reasoning; None when absent.
+        redacted: Flag indicating if the reasoning has been redacted; False by default.
+    """
+    # Narrows the base class's Literal to a single value (presumably why the type-checker
+    # ignore is needed)
+    type: Literal["reasoning"] = "reasoning"  # type: ignore
+    reasoning: str
+    signature: str | None = None
+    redacted: bool = False
+# Content type discriminated union: the "type" field of each item selects the concrete model
+Content = Annotated[ContentText | ContentReasoning, Discriminator("type")]
+"""Discriminated union of possible content types using the 'type' field.
+Can be either ContentText or ContentReasoning.
+"""