docent-python 0.1.19a0__py3-none-any.whl → 0.1.27a0__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Note: this release of docent-python has been flagged as potentially problematic.
- docent/_llm_util/__init__.py +0 -0
- docent/_llm_util/data_models/__init__.py +0 -0
- docent/_llm_util/data_models/exceptions.py +48 -0
- docent/_llm_util/data_models/llm_output.py +331 -0
- docent/_llm_util/llm_cache.py +193 -0
- docent/_llm_util/llm_svc.py +472 -0
- docent/_llm_util/model_registry.py +130 -0
- docent/_llm_util/providers/__init__.py +0 -0
- docent/_llm_util/providers/anthropic.py +537 -0
- docent/_llm_util/providers/common.py +41 -0
- docent/_llm_util/providers/google.py +530 -0
- docent/_llm_util/providers/openai.py +745 -0
- docent/_llm_util/providers/openrouter.py +375 -0
- docent/_llm_util/providers/preference_types.py +104 -0
- docent/_llm_util/providers/provider_registry.py +164 -0
- docent/data_models/__init__.py +2 -2
- docent/data_models/agent_run.py +1 -0
- docent/data_models/judge.py +7 -4
- docent/data_models/transcript.py +2 -0
- docent/data_models/util.py +170 -0
- docent/judges/__init__.py +23 -0
- docent/judges/analysis.py +77 -0
- docent/judges/impl.py +587 -0
- docent/judges/runner.py +129 -0
- docent/judges/stats.py +205 -0
- docent/judges/types.py +311 -0
- docent/judges/util/forgiving_json.py +108 -0
- docent/judges/util/meta_schema.json +86 -0
- docent/judges/util/meta_schema.py +29 -0
- docent/judges/util/parse_output.py +87 -0
- docent/judges/util/voting.py +139 -0
- docent/sdk/client.py +181 -44
- docent/trace.py +362 -44
- {docent_python-0.1.19a0.dist-info → docent_python-0.1.27a0.dist-info}/METADATA +11 -5
- docent_python-0.1.27a0.dist-info/RECORD +59 -0
- docent_python-0.1.19a0.dist-info/RECORD +0 -32
- {docent_python-0.1.19a0.dist-info → docent_python-0.1.27a0.dist-info}/WHEEL +0 -0
- {docent_python-0.1.19a0.dist-info → docent_python-0.1.27a0.dist-info}/licenses/LICENSE.md +0 -0
docent/_llm_util/__init__.py
File without changes

docent/_llm_util/data_models/__init__.py
File without changes

docent/_llm_util/data_models/exceptions.py
@@ -0,0 +1,48 @@
+class LLMException(Exception):
+    error_type_id = "other"
+    user_message = "The model failed to respond. Please try again later."
+
+
+class CompletionTooLongException(LLMException):
+    error_type_id = "completion_too_long"
+    user_message = "Completion too long."
+
+
+class RateLimitException(LLMException):
+    error_type_id = "rate_limit"
+    user_message = "Rate limited by the model provider. Please wait and try again."
+
+
+class ContextWindowException(LLMException):
+    error_type_id = "context_window"
+    user_message = "Context window exceeded."
+
+
+class NoResponseException(LLMException):
+    error_type_id = "no_response"
+    user_message = "The model returned an empty response. Please try again later."
+
+
+class DocentUsageLimitException(LLMException):
+    error_type_id = "docent_usage_limit"
+    user_message = "Free daily usage limit reached. Add your own API key in settings or contact us for increased limits."
+
+
+class ValidationFailedException(LLMException):
+    error_type_id = "validation_failed"
+    user_message = "The model returned invalid output that failed validation."
+
+    def __init__(self, message: str = "", failed_output: str | None = None):
+        super().__init__(message)
+        self.failed_output = failed_output
+
+
+LLM_ERROR_TYPES: list[type[LLMException]] = [
+    LLMException,
+    CompletionTooLongException,
+    RateLimitException,
+    ContextWindowException,
+    NoResponseException,
+    DocentUsageLimitException,
+    ValidationFailedException,
+]

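The error_type_id strings double as stable serialization keys (LLMOutput.to_dict/from_dict below store errors by id). A minimal sketch of how calling code might surface these errors; describe_error is an illustrative helper, not part of the package:

from docent._llm_util.data_models.exceptions import (
    LLM_ERROR_TYPES,
    LLMException,
    RateLimitException,
)


def describe_error(exc: LLMException) -> str:
    # Every exception class carries a machine-readable id and a user-facing message.
    return f"[{exc.error_type_id}] {exc.user_message}"


# Rebuild an exception instance from a stored error_type_id, mirroring the
# lookup that LLMOutput.from_dict performs.
error_type_map = {e.error_type_id: e for e in LLM_ERROR_TYPES}
exc = error_type_map.get("rate_limit", LLMException)()
assert isinstance(exc, RateLimitException)
print(describe_error(exc))  # [rate_limit] Rate limited by the model provider. ...
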
docent/_llm_util/data_models/llm_output.py
@@ -0,0 +1,331 @@
+import json
+from dataclasses import dataclass, field
+from typing import Any, Literal, Protocol, cast
+
+from openai.types.chat.chat_completion_token_logprob import TopLogprob
+from pydantic import BaseModel
+
+from docent._llm_util.data_models.exceptions import (
+    LLM_ERROR_TYPES,
+    CompletionTooLongException,
+    ContextWindowException,
+    LLMException,
+)
+from docent._log_util import get_logger
+from docent.data_models.chat import ToolCall
+
+logger = get_logger(__name__)
+
+FinishReasonType = Literal[
+    "error",
+    "stop",
+    "length",
+    "tool_calls",
+    "content_filter",
+    "function_call",
+    "streaming",
+    "refusal",
+]
+"""Possible reasons for an LLM completion to finish."""
+
+
+TokenType = Literal["input", "output", "cache_read", "cache_write"]
+
+
+class UsageMetrics:
+    _usage: dict[TokenType, int]
+
+    def __init__(self, **kwargs: int | None):
+        filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None}
+        self._usage = cast(dict[TokenType, int], filtered_kwargs)
+
+    def __getitem__(self, key: TokenType) -> int:
+        return self._usage.get(key, 0)
+
+    def __setitem__(self, key: TokenType, value: int):
+        self._usage[key] = value
+
+    def to_dict(self) -> dict[TokenType, int]:
+        # Filter out 0 values to avoid cluttering the database
+        return {k: v for k, v in self._usage.items() if v != 0}
+
+    @property
+    def total_tokens(self) -> int:
+        return self["input"] + self["output"]
+
+
+class LLMCompletion(BaseModel):
+    """A single completion from an LLM.
+
+    Attributes:
+        text: The generated text content.
+        tool_calls: List of tool calls made during the completion.
+        finish_reason: Reason why the completion finished.
+        top_logprobs: Probability distribution for top token choices.
+    """
+
+    text: str | None = None
+    tool_calls: list[ToolCall] | None = None
+    finish_reason: FinishReasonType | None = None
+    top_logprobs: list[list[TopLogprob]] | None = None
+    reasoning_tokens: str | None = None
+
+    @property
+    def no_text(self) -> bool:
+        """Check if the completion has no text.
+
+        Returns:
+            bool: True if text is None or empty, False otherwise.
+        """
+        return self.text is None or len(self.text) == 0
+
+
+@dataclass
+class LLMOutput:
+    """Container for LLM output, potentially with multiple completions.
+
+    Aggregates completions from an LLM along with metadata and error information.
+
+    Attributes:
+        model: The name/identifier of the model used.
+        completions: List of individual completions.
+        errors: List of error types encountered during generation.
+    """
+
+    model: str
+    completions: list[LLMCompletion]
+    errors: list[LLMException] = field(default_factory=list)
+    usage: UsageMetrics = field(default_factory=UsageMetrics)
+    from_cache: bool = False
+    duration: float | None = None
+
+    @property
+    def non_empty(self) -> bool:
+        """Check if there are any completions.
+
+        Returns:
+            bool: True if there's at least one completion, False otherwise.
+        """
+        return len(self.completions) > 0
+
+    @property
+    def first(self) -> LLMCompletion | None:
+        """Get the first completion if available.
+
+        Returns:
+            LLMCompletion | None: The first completion or None if no completions exist.
+        """
+        return self.completions[0] if self.non_empty else None
+
+    @property
+    def first_text(self) -> str | None:
+        """Get the text of the first completion if available.
+
+        Returns:
+            str | None: The text of the first completion or None if no completion exists.
+        """
+        return self.first.text if self.first else None
+
+    @property
+    def did_error(self) -> bool:
+        """Check if any errors occurred during generation.
+
+        Returns:
+            bool: True if there were errors, False otherwise.
+        """
+        return bool(self.errors)
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "model": self.model,
+            "completions": [comp.model_dump() for comp in self.completions],
+            "errors": [e.error_type_id for e in self.errors],
+            "usage": self.usage.to_dict(),
+            "from_cache": self.from_cache,
+            "duration": self.duration,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "LLMOutput":
+        error_type_map = {e.error_type_id: e for e in LLM_ERROR_TYPES}
+        errors = data.get("errors", [])
+        error_types_to_not_log: list[str] = [
+            CompletionTooLongException.error_type_id,
+            ContextWindowException.error_type_id,
+        ]
+        errors_to_log = [e for e in errors if e not in error_types_to_not_log]
+        if errors_to_log:
+            logger.error(f"Loading LLM output with errors: {errors}")
+        errors = [error_type_map.get(e, LLMException)() for e in errors]
+
+        completions = data.get("completions", [])
+        completions = [LLMCompletion.model_validate(comp) for comp in completions]
+
+        usage: dict[TokenType, int] = {}
+        if data_usage := data.get("usage"):
+            usage = cast(dict[TokenType, int], data_usage)
+
+        return cls(
+            model=data["model"],
+            completions=completions,
+            errors=errors,
+            usage=UsageMetrics(**usage),
+            from_cache=bool(data.get("from_cache", False)),
+            duration=data.get("duration"),
+        )
+
+
+@dataclass
+class ToolCallPartial:
+    """Partial representation of a tool call before full processing.
+
+    Used as an intermediate format before finalizing into a complete ToolCall.
+
+    Args:
+        id: The identifier for the tool call.
+        function: The name of the function to call.
+        arguments_raw: Raw JSON string of arguments for the function.
+        type: The type of the tool call, always "function".
+    """
+
+    id: str | None
+    function: str | None
+    arguments_raw: str | None
+    type: Literal["function"]
+
+
+class LLMCompletionPartial(LLMCompletion):
+    """Partial representation of an LLM completion before finalization.
+
+    Extends LLMCompletion but with tool_calls being a list of ToolCallPartial.
+    This is used during the processing stage before tool calls are fully parsed.
+
+    Attributes:
+        tool_calls: List of partial tool call representations.
+    """
+
+    tool_calls: list[ToolCallPartial | None] | None = None  # type: ignore
+
+
+class LLMOutputPartial(LLMOutput):
+    """Partial representation of LLM output before finalization.
+
+    Extends LLMOutput but with completions being a list of LLMCompletionPartial.
+    Used as an intermediate format during processing.
+
+    Attributes:
+        completions: List of partial completions.
+    """
+
+    completions: list[LLMCompletionPartial]  # type: ignore
+
+
+def finalize_llm_output_partial(partial: LLMOutputPartial) -> LLMOutput:
+    """Convert a partial LLM output into a finalized LLM output.
+
+    Processes tool calls by parsing their arguments from raw JSON strings,
+    handles errors in JSON parsing, and provides warnings for truncated completions.
+
+    Args:
+        partial: The partial LLM output to finalize.
+
+    Returns:
+        LLMOutput: The finalized LLM output with processed tool calls.
+
+    Raises:
+        CompletionTooLongException: If the completion was truncated due to length
+            and resulted in empty text.
+        ValueError: If tool call ID or function is missing in the partial data.
+    """

+    def _parse_tool_call(tc_partial: ToolCallPartial):
+        if tc_partial.id is None:
+            raise ValueError("Tool call ID not found in partial; check for parsing errors")
+        if tc_partial.function is None:
+            raise ValueError("Tool call function not found in partial; check for parsing errors")
+
+        arguments: dict[str, Any] = {}
+        # Attempt to load arguments into JSON
+        try:
+            arguments = json.loads(tc_partial.arguments_raw or "{}")
+            parse_error = None
+        # If the tool call arguments are not valid JSON, return an empty dict with the error
+        except Exception as e:
+            arguments = {"__parse_error_raw_args": tc_partial.arguments_raw}
+            parse_error = f"Couldn't parse tool call arguments as JSON: {e}. Original input: {tc_partial.arguments_raw}"
+
+        return ToolCall(
+            id=tc_partial.id,
+            function=tc_partial.function,
+            arguments=arguments,
+            parse_error=parse_error,
+            type=tc_partial.type,
+        )
+
+    output = LLMOutput(
+        model=partial.model,
+        completions=[
+            LLMCompletion(
+                text=c.text,
+                tool_calls=[_parse_tool_call(tc) for tc in (c.tool_calls or []) if tc is not None],
+                finish_reason=c.finish_reason,
+                reasoning_tokens=c.reasoning_tokens,
+            )
+            for c in partial.completions
+        ],
+        usage=partial.usage,
+        from_cache=False,
+    )
+
+    # If the completion is empty and was truncated (likely due to too much reasoning), raise an exception
+    if output.first and output.first.finish_reason == "length" and output.first.no_text:
+        raise CompletionTooLongException(
+            "Completion empty due to truncation. Consider increasing max_new_tokens."
+        )
+    for c in output.completions:
+        if c.finish_reason == "length":
+            logger.warning(
+                "Completion truncated due to length; consider increasing max_new_tokens."
+            )
+
+    return output
+
+
+class AsyncLLMOutputStreamingCallback(Protocol):
+    """Protocol for asynchronous streaming callbacks with batch index.
+
+    Defines the expected signature for callbacks that handle streaming output
+    with a batch index.
+
+    Args:
+        batch_index: The index of the current batch.
+        llm_output: The LLM output for the current batch.
+    """
+
+    async def __call__(
+        self,
+        batch_index: int,
+        llm_output: LLMOutput,
+    ) -> None: ...
+
+
+class AsyncSingleLLMOutputStreamingCallback(Protocol):
+    """Protocol for asynchronous streaming callbacks without batch indexing.
+
+    Defines the expected signature for callbacks that handle streaming output
+    without batch indexing.
+
+    Args:
+        llm_output: The LLM output to process.
+    """
+
+    async def __call__(
+        self,
+        llm_output: LLMOutput,
+    ) -> None: ...
+
+
+class AsyncEmbeddingStreamingCallback(Protocol):
+    """Protocol for sending progress updates for embedding generation."""
+
+    async def __call__(self, progress: int) -> None: ...

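A short sketch of the dict round trip these classes support (illustrative usage, not code from the package; the model name is a placeholder): to_dict stores errors by error_type_id and usage as a plain dict, while from_dict restores exception instances and re-validates completions.

from docent._llm_util.data_models.exceptions import RateLimitException
from docent._llm_util.data_models.llm_output import LLMCompletion, LLMOutput, UsageMetrics

output = LLMOutput(
    model="example-model",  # placeholder model identifier
    completions=[LLMCompletion(text="Hello!", finish_reason="stop")],
    errors=[RateLimitException()],
    usage=UsageMetrics(input=12, output=3),
)

payload = output.to_dict()            # JSON-serializable dict
restored = LLMOutput.from_dict(payload)

assert restored.first_text == "Hello!"
assert restored.did_error             # the rate_limit error survives the round trip
assert restored.usage.total_tokens == 15

This JSON-friendly shape is what the SQLite-backed cache in the next file serializes and deserializes.
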
docent/_llm_util/llm_cache.py
@@ -0,0 +1,193 @@
+import hashlib
+import json
+import os
+import sqlite3
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Literal
+
+from docent._llm_util.data_models.llm_output import LLMOutput
+from docent._log_util import get_logger
+from docent.data_models.chat import ChatMessage, ToolInfo
+
+logger = get_logger(__name__)
+
+
+class LLMCache:
+    def __init__(self, db_path: str | None = None):
+        if db_path is None:
+            llm_cache_path = os.getenv("LLM_CACHE_PATH")
+            if llm_cache_path is None or llm_cache_path == "":
+                raise ValueError("LLM_CACHE_PATH is not set")
+            else:
+                cache_dir = Path(llm_cache_path)
+                cache_dir.mkdir(parents=True, exist_ok=True)
+                db_path = str(cache_dir / "llm_cache.db")
+
+        self.db_path = db_path
+        self._init_db()
+
+    def _init_db(self) -> None:
+        with self._get_connection() as conn:
+            conn.execute(
+                """
+                CREATE TABLE IF NOT EXISTS llm_cache (
+                    key TEXT PRIMARY KEY,
+                    completion TEXT,
+                    model_name TEXT,
+                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                )
+                """
+            )
+
+    @contextmanager
+    def _get_connection(self):
+        conn = sqlite3.connect(self.db_path)
+        try:
+            yield conn
+        finally:
+            conn.close()
+
+    def _create_key(
+        self,
+        messages: list[ChatMessage],
+        model_name: str,
+        *,
+        tools: list[ToolInfo] | None = None,
+        tool_choice: Literal["auto", "required"] | None = None,
+        reasoning_effort: Literal["minimal", "low", "medium", "high"] | None = None,
+        temperature: float = 1.0,
+        logprobs: bool = False,
+        top_logprobs: int | None = None,
+    ) -> str:
+        """Create a deterministic hash key from messages and model."""
+        # Convert messages to a stable string representation
+        message_str = json.dumps(
+            [msg.model_dump(exclude={"id"}) for msg in messages], sort_keys=True
+        )
+
+        # Convert tools to a stable string representation if present
+        tools_str = (
+            json.dumps([tool.model_dump() for tool in tools], sort_keys=True) if tools else None
+        )
+
+        # Combine all parameters into a single string
+        key_str = (
+            f"{message_str}:{model_name}:{tools_str}:{tool_choice}:{reasoning_effort}:{temperature}"
+        )
+        if logprobs:
+            key_str += f":{top_logprobs}"
+        return hashlib.sha256(key_str.encode()).hexdigest()
+
+    def get(
+        self,
+        messages: list[ChatMessage],
+        model_name: str,
+        *,
+        tools: list[ToolInfo] | None = None,
+        tool_choice: Literal["auto", "required"] | None = None,
+        reasoning_effort: Literal["minimal", "low", "medium", "high"] | None = None,
+        temperature: float = 1.0,
+        logprobs: bool = False,
+        top_logprobs: int | None = None,
+    ) -> LLMOutput | None:
+        """Get cached completion for a conversation if it exists."""

+        key = self._create_key(
+            messages,
+            model_name,
+            tools=tools,
+            tool_choice=tool_choice,
+            reasoning_effort=reasoning_effort,
+            temperature=temperature,
+            logprobs=logprobs,
+            top_logprobs=top_logprobs,
+        )
+
+        with self._get_connection() as conn:
+            cursor = conn.execute("SELECT completion FROM llm_cache WHERE key = ?", (key,))
+            result = cursor.fetchone()
+            if not result:
+                return None
+            out = LLMOutput.from_dict(json.loads(result[0]))
+            out.from_cache = True
+            return out
+
+    def set(
+        self,
+        messages: list[ChatMessage],
+        model_name: str,
+        llm_output: LLMOutput,
+        *,
+        tools: list[ToolInfo] | None = None,
+        tool_choice: Literal["auto", "required"] | None = None,
+        reasoning_effort: Literal["minimal", "low", "medium", "high"] | None = None,
+        temperature: float = 1.0,
+        logprobs: bool = False,
+        top_logprobs: int | None = None,
+    ) -> None:
+        """Cache a completion for a conversation."""
+
+        key = self._create_key(
+            messages,
+            model_name,
+            tools=tools,
+            tool_choice=tool_choice,
+            reasoning_effort=reasoning_effort,
+            temperature=temperature,
+            logprobs=logprobs,
+            top_logprobs=top_logprobs,
+        )
+
+        with self._get_connection() as conn:
+            conn.execute(
+                "INSERT OR REPLACE INTO llm_cache (key, completion, model_name) VALUES (?, ?, ?)",
+                (key, json.dumps(llm_output.to_dict()), model_name),
+            )
+            conn.commit()
+
+    def set_batch(
+        self,
+        messages_list: list[list[ChatMessage]],
+        model_name: str,
+        llm_output_list: list[LLMOutput],
+        *,
+        tools: list[ToolInfo] | None = None,
+        tool_choice: Literal["auto", "required"] | None = None,
+        reasoning_effort: Literal["minimal", "low", "medium", "high"] | None = None,
+        temperature: float = 1.0,
+        logprobs: bool = False,
+        top_logprobs: int | None = None,
+    ) -> None:
+        """Cache a completion for a conversation."""
+
+        keys: list[str] = []
+        for messages in messages_list:
+            key = self._create_key(
+                messages,
+                model_name,
+                tools=tools,
+                tool_choice=tool_choice,
+                reasoning_effort=reasoning_effort,
+                temperature=temperature,
+                logprobs=logprobs,
+                top_logprobs=top_logprobs,
+            )
+            keys.append(key)
+
+        with self._get_connection() as conn:
+            conn.executemany(
+                "INSERT OR REPLACE INTO llm_cache (key, completion, model_name) VALUES (?, ?, ?)",
+                [
+                    (key, json.dumps(llm_output.to_dict()), model_name)
+                    for key, llm_output in zip(keys, llm_output_list)
+                ],
+            )
+            conn.commit()
+
+    def clear(self) -> None:
+        """Clear all cached completions."""
+
+        with self._get_connection() as conn:
+            conn.execute("DELETE FROM llm_cache")
+            conn.commit()
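
A minimal usage sketch for the cache. Assumptions: the /tmp path and the empty message list are only to keep the example self-contained; real callers pass docent ChatMessage objects and typically let the service layer drive the cache.

import os

from docent._llm_util.data_models.llm_output import LLMCompletion, LLMOutput
from docent._llm_util.llm_cache import LLMCache

# LLMCache falls back to the LLM_CACHE_PATH directory when no db_path is given.
os.environ.setdefault("LLM_CACHE_PATH", "/tmp/docent_llm_cache")
cache = LLMCache()

messages: list = []  # real calls pass a list of docent ChatMessage objects
output = LLMOutput(model="example-model", completions=[LLMCompletion(text="cached answer")])

# Keys are SHA-256 hashes over the messages, model name, and sampling options,
# so a later get() only hits when all of those match.
cache.set(messages, "example-model", output, temperature=0.0)
hit = cache.get(messages, "example-model", temperature=0.0)
assert hit is not None and hit.from_cache

Storing completions as JSON keyed by a content hash keeps the schema to a single table and makes the whole cache cheap to invalidate with clear().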