docent-python 0.1.22a0__py3-none-any.whl → 0.1.24a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -96,6 +96,7 @@ class LLMOutput:
     errors: list[LLMException] = field(default_factory=list)
     usage: UsageMetrics = field(default_factory=UsageMetrics)
     from_cache: bool = False
+    duration: float | None = None

     @property
     def non_empty(self) -> bool:
@@ -140,6 +141,7 @@ class LLMOutput:
             "errors": [e.error_type_id for e in self.errors],
             "usage": self.usage.to_dict(),
             "from_cache": self.from_cache,
+            "duration": self.duration,
         }

     @classmethod
@@ -161,6 +163,7 @@ class LLMOutput:
             errors=errors,
             usage=UsageMetrics(**usage),
             from_cache=bool(data.get("from_cache", False)),
+            duration=data.get("duration"),
         )

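
Together, these three hunks give LLMOutput an optional wall-clock duration (in seconds) that now survives serialization. A minimal sketch of the round trip, assuming the dict literal above is built by a to_dict-style method; the constructor arguments mirror calls that appear later in this diff, and the concrete values are made up:

    # duration stays None unless the call actually reached a provider
    output = LLMOutput(model="some-model", completions=[], duration=1.37)
    payload = output.to_dict()          # now carries "duration": 1.37
    assert payload["duration"] == 1.37  # and data.get("duration") restores it on load
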
@@ -55,7 +55,7 @@ class LLMCache:
         *,
         tools: list[ToolInfo] | None = None,
         tool_choice: Literal["auto", "required"] | None = None,
-        reasoning_effort: Literal["low", "medium", "high"] | None = None,
+        reasoning_effort: Literal["minimal", "low", "medium", "high"] | None = None,
         temperature: float = 1.0,
         logprobs: bool = False,
         top_logprobs: int | None = None,
@@ -86,7 +86,7 @@ class LLMCache:
         *,
         tools: list[ToolInfo] | None = None,
         tool_choice: Literal["auto", "required"] | None = None,
-        reasoning_effort: Literal["low", "medium", "high"] | None = None,
+        reasoning_effort: Literal["minimal", "low", "medium", "high"] | None = None,
         temperature: float = 1.0,
         logprobs: bool = False,
         top_logprobs: int | None = None,
@@ -121,7 +121,7 @@ class LLMCache:
         *,
         tools: list[ToolInfo] | None = None,
         tool_choice: Literal["auto", "required"] | None = None,
-        reasoning_effort: Literal["low", "medium", "high"] | None = None,
+        reasoning_effort: Literal["minimal", "low", "medium", "high"] | None = None,
         temperature: float = 1.0,
         logprobs: bool = False,
         top_logprobs: int | None = None,
@@ -154,7 +154,7 @@ class LLMCache:
         *,
         tools: list[ToolInfo] | None = None,
         tool_choice: Literal["auto", "required"] | None = None,
-        reasoning_effort: Literal["low", "medium", "high"] | None = None,
+        reasoning_effort: Literal["minimal", "low", "medium", "high"] | None = None,
         temperature: float = 1.0,
         logprobs: bool = False,
         top_logprobs: int | None = None,
@@ -1,17 +1,8 @@
-"""
-At some point we'll want to do a refactor to support different types of provider/key swapping
-due to different scenarios. However, this'll probably be a breaking change, which is why I'm
-not doing it now.
-
-- mengk
-"""
-
+import time
 import traceback
-from contextlib import nullcontext
 from functools import partial
 from typing import (
     Any,
-    AsyncContextManager,
     Literal,
     Protocol,
     Sequence,
@@ -20,6 +11,7 @@ from typing import (
 )

 import anyio
+from anyio import Lock, Semaphore
 from anyio.abc import TaskGroup
 from tqdm.auto import tqdm

@@ -44,10 +36,12 @@ from docent._llm_util.providers.provider_registry import (
 from docent._log_util import get_logger
 from docent.data_models.chat import ChatMessage, ToolInfo, parse_chat_message

-MAX_VALIDATION_ATTEMPTS = 3
-
 logger = get_logger(__name__)

+MAX_VALIDATION_ATTEMPTS = 3
+DEFAULT_MAX_CONCURRENCY = 100
+DEFAULT_SVC_MAX_CONCURRENCY = 100
+

 @runtime_checkable
 class MessageResolver(Protocol):
@@ -87,11 +81,11 @@ async def _parallelize_calls(
     tool_choice: Literal["auto", "required"] | None,
     max_new_tokens: int,
     temperature: float,
-    reasoning_effort: Literal["low", "medium", "high"] | None,
+    reasoning_effort: Literal["minimal", "low", "medium", "high"] | None,
     logprobs: bool,
     top_logprobs: int | None,
     timeout: float,
-    semaphore: AsyncContextManager[anyio.Semaphore] | None,
+    semaphore: Semaphore,
     # use_tqdm: bool,
     cache: LLMCache | None = None,
 ):
@@ -122,17 +116,19 @@ async def _parallelize_calls(
     # Save resolved messages to avoid multiple resolutions
     resolved_messages: list[list[ChatMessage] | None] = [None] * len(inputs)

-    cancelled_due_to_usage_limit: bool = False
+    # Not sure why the cast is necessary for the type checker
+    cancelled_due_to_usage_limit: bool = cast(bool, False)

     async def _limited_task(i: int, cur_input: MessagesInput, tg: TaskGroup):
         nonlocal responses, pbar, resolved_messages, cancelled_due_to_usage_limit

-        async with semaphore or nullcontext():
+        async with semaphore:
             messages = _resolve_messages_input(cur_input)
             resolved_messages[i] = messages

             retry_count = 0
             result = None
+            call_started_at: float | None = None

             # Check if there's a cached result
             cached_result = (
@@ -154,6 +150,7 @@ async def _parallelize_calls(
                 if streaming_callback is not None:
                     await streaming_callback(i, result)
             else:
+                call_started_at = time.perf_counter()
                 while retry_count < MAX_VALIDATION_ATTEMPTS:
                     try:
                         if streaming_callback is None:
@@ -187,7 +184,7 @@ async def _parallelize_calls(
                             errors=[e],
                         )
                         break
-                    except DocentUsageLimitException as e:
+                    except DocentUsageLimitException as _:
                         result = LLMOutput(
                             model=model_name,
                             completions=[],
@@ -219,6 +216,10 @@ async def _parallelize_calls(
                         )
                         break

+            # Only store the elapsed time if we didn't hit the cache and the call was successful
+            if cached_result is None and result is not None and call_started_at is not None:
+                result.duration = time.perf_counter() - call_started_at
+
             # Always call completion callback with final result (success or error)
             if completion_callback and result is not None:
                 try:
@@ -273,21 +274,24 @@ async def _parallelize_calls(
     # Cache what we have so far if something got cancelled
     except anyio.get_cancelled_exc_class():
         num_cached = _cache_responses()
-        logger.info(
-            f"Cancelled {len(inputs) - num_cached} unfinished LLM API calls; cached {num_cached} completed responses"
-        )
-        raise
+        if num_cached:
+            logger.info(
+                f"Cancelled {len(inputs) - num_cached} unfinished LLM API calls, but cached {num_cached} completed responses"
+            )

-    if cancelled_due_to_usage_limit:
-        for i in range(len(responses)):
-            if responses[i] is None:
-                responses[i] = LLMOutput(
-                    model=model_name,
-                    completions=[],
-                    errors=[DocentUsageLimitException()],
-                )
-            else:
-                responses[i].errors.append(DocentUsageLimitException())
+        # If the task was cancelled due to usage limit, set the response to a usage limit exception
+        if cancelled_due_to_usage_limit:
+            for i, response in enumerate(responses):
+                if response is None:
+                    responses[i] = LLMOutput(
+                        model=model_name,
+                        completions=[],
+                        errors=[DocentUsageLimitException()],
+                    )
+                else:
+                    response.errors.append(DocentUsageLimitException())
+
+        raise

     # Cache results if available
     _cache_responses()
@@ -300,51 +304,88 @@ async def _parallelize_calls(
     return cast(list[LLMOutput], responses)


-class LLMManager:
-    def __init__(
-        self,
-        model_options: list[ModelOption],
-        api_key_overrides: dict[str, str] | None = None,
-        use_cache: bool = False,
-    ):
-        # TODO(mengk): make this more robust, possibly move to a NoSQL database or something
-        try:
-            self.cache = LLMCache() if use_cache else None
-        except ValueError as e:
-            logger.warning(f"Disabling LLM cache due to init error: {e}")
-            self.cache = None
+class BaseLLMService:
+    def __init__(self, max_concurrency: int = DEFAULT_SVC_MAX_CONCURRENCY):
+        self._semaphore = Semaphore(max_concurrency)
+        self._client_cache: dict[tuple[str, str | None], Any] = {}  # (provider, api_key) -> client
+        self._client_cache_lock = Lock()
+
+    async def _get_cached_client(self, provider: str, override_key: str | None) -> Any:
+        """Return a cached client for the provider/api-key tuple, creating one if needed."""
+        cache_key = (provider, override_key)
+        async with self._client_cache_lock:
+            cached = self._client_cache.get(cache_key)
+            if cached is not None:
+                return cached

-        self.model_options = model_options
-        self.current_model_option_index = 0
-        self.api_key_overrides = api_key_overrides or {}
+            client_factory = PROVIDERS[provider]["async_client_getter"]
+            new_client = client_factory(override_key)
+            self._client_cache[cache_key] = new_client
+            return new_client

     async def get_completions(
         self,
+        *,
         inputs: list[MessagesInput],
+        model_options: list[ModelOption],
         tools: list[ToolInfo] | None = None,
         tool_choice: Literal["auto", "required"] | None = None,
-        max_new_tokens: int = 32,
+        max_new_tokens: int = 1024,
         temperature: float = 1.0,
         logprobs: bool = False,
         top_logprobs: int | None = None,
-        max_concurrency: int | None = None,
-        timeout: float = 5.0,
+        timeout: float = 120.0,
         streaming_callback: AsyncLLMOutputStreamingCallback | None = None,
         validation_callback: AsyncLLMOutputStreamingCallback | None = None,
         completion_callback: AsyncLLMOutputStreamingCallback | None = None,
+        use_cache: bool = False,
+        _api_key_overrides: dict[str, str] = dict(),
     ) -> list[LLMOutput]:
+        """Request completions from a configured LLM provider."""
+
+        # We don't support logprobs for Anthropic yet
+        if logprobs:
+            for model_option in model_options:
+                if model_option.provider == "anthropic":
+                    raise ValueError(
+                        f"Logprobs are not supported for Anthropic, so we can't use model {model_option.model_name}"
+                    )
+
+        # Instantiate cache
+        # TODO(mengk): make this more robust, possibly move to a NoSQL database or something
+        try:
+            cache = LLMCache() if use_cache else None
+        except ValueError as e:
+            logger.warning(f"Disabling LLM cache due to init error: {e}")
+            cache = None
+
+        # Initialize pointer to which model we're using; used for model rotation after failures
+        current_model_option_index = 0
+
+        def _rotate_model_option() -> ModelOption | None:
+            nonlocal current_model_option_index
+
+            current_model_option_index += 1
+            if current_model_option_index >= len(model_options):
+                logger.error("All model options are exhausted")
+                return None
+
+            new_model_option = model_options[current_model_option_index]
+            logger.warning(f"Switched to next model {new_model_option.model_name}")
+            return new_model_option
+
         while True:
             # Parse the current model option
-            cur_option = self.model_options[self.current_model_option_index]
+            cur_option = model_options[current_model_option_index]
             provider, model_name, reasoning_effort = (
                 cur_option.provider,
                 cur_option.model_name,
                 cur_option.reasoning_effort,
             )

-            override_key = self.api_key_overrides.get(provider)
+            override_key = _api_key_overrides.get(provider)

-            client = PROVIDERS[provider]["async_client_getter"](override_key)
+            client = await self._get_cached_client(provider, override_key)
             single_output_getter = PROVIDERS[provider]["single_output_getter"]
             single_streaming_output_getter = PROVIDERS[provider]["single_streaming_output_getter"]

@@ -369,10 +410,8 @@ class LLMManager:
                 logprobs=logprobs,
                 top_logprobs=top_logprobs,
                 timeout=timeout,
-                semaphore=(
-                    anyio.Semaphore(max_concurrency) if max_concurrency is not None else None
-                ),
-                cache=self.cache,
+                semaphore=self._semaphore,
+                cache=cache,
             )
             assert len(outputs) == len(inputs), "Number of outputs must match number of messages"

@@ -388,23 +427,13 @@ class LLMManager:
             )
             if num_rotation_errors > 0:
                 logger.warning(f"{model_name}: {num_rotation_errors} API errors")
-                if not self._rotate_model_option():
+                if not _rotate_model_option():
                     break
             else:
                 break

         return outputs

-    def _rotate_model_option(self) -> ModelOption | None:
-        self.current_model_option_index += 1
-        if self.current_model_option_index >= len(self.model_options):
-            logger.error("All model options are exhausted")
-            return None
-
-        new_model_option = self.model_options[self.current_model_option_index]
-        logger.warning(f"Switched to next model {new_model_option.model_name}")
-        return new_model_option
-

 async def get_llm_completions_async(
     inputs: list[MessagesInput],
@@ -415,40 +444,29 @@ async def get_llm_completions_async(
     temperature: float = 1.0,
     logprobs: bool = False,
     top_logprobs: int | None = None,
-    max_concurrency: int = 100,
     timeout: float = 120.0,
     streaming_callback: AsyncLLMOutputStreamingCallback | None = None,
     validation_callback: AsyncLLMOutputStreamingCallback | None = None,
     completion_callback: AsyncLLMOutputStreamingCallback | None = None,
     use_cache: bool = False,
-    api_key_overrides: dict[str, str] | None = None,
+    _api_key_overrides: dict[str, str] = dict(),
 ) -> list[LLMOutput]:
-    # We don't support logprobs for Anthropic yet
-    if logprobs:
-        for model_option in model_options:
-            if model_option.provider == "anthropic":
-                raise ValueError(
-                    f"Logprobs are not supported for Anthropic, so we can't use model {model_option.model_name}"
-                )
+    """Convenience method for backward compatibility"""

-    # Create the LLM manager
-    llm_manager = LLMManager(
+    svc = BaseLLMService()
+    return await svc.get_completions(
+        inputs=inputs,
         model_options=model_options,
-        api_key_overrides=api_key_overrides,
-        use_cache=use_cache,
-    )
-
-    return await llm_manager.get_completions(
-        inputs,
         tools=tools,
         tool_choice=tool_choice,
         max_new_tokens=max_new_tokens,
         temperature=temperature,
         logprobs=logprobs,
         top_logprobs=top_logprobs,
-        max_concurrency=max_concurrency,
         timeout=timeout,
         streaming_callback=streaming_callback,
         validation_callback=validation_callback,
         completion_callback=completion_callback,
+        use_cache=use_cache,
+        _api_key_overrides=_api_key_overrides,
     )
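
Net effect of this file: the per-call LLMManager is replaced by a reusable BaseLLMService that owns one shared Semaphore (DEFAULT_SVC_MAX_CONCURRENCY = 100) and a locked per-(provider, API key) client cache, while get_llm_completions_async becomes a thin backward-compatibility wrapper. A hedged usage sketch built only from the signatures above; the inputs and model names are placeholders:

    svc = BaseLLMService(max_concurrency=20)   # one semaphore shared by every call on this service
    outputs = await svc.get_completions(
        inputs=my_inputs,                      # list[MessagesInput]; placeholder
        model_options=[ModelOption(provider="openai", model_name="gpt-4o-mini")],  # illustrative
        max_new_tokens=256,
        use_cache=True,
    )
    # get_llm_completions_async(...) now simply does BaseLLMService().get_completions(...)
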
@@ -22,7 +22,7 @@ class ModelOption(BaseModel):

     provider: str
     model_name: str
-    reasoning_effort: Literal["low", "medium", "high"] | None = None
+    reasoning_effort: Literal["minimal", "low", "medium", "high"] | None = None


 class ModelOptionWithContext(BaseModel):
@@ -39,7 +39,7 @@ class ModelOptionWithContext(BaseModel):

     provider: str
     model_name: str
-    reasoning_effort: Literal["low", "medium", "high"] | None = None
+    reasoning_effort: Literal["minimal", "low", "medium", "high"] | None = None
     context_window: int
     uses_byok: bool

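
Both option models now admit the extra level, so a "minimal" effort can be requested per model. A hedged sketch (provider and model name are illustrative; only the field names come from the hunks above):

    option = ModelOption(
        provider="openai",
        model_name="gpt-5-mini",        # illustrative
        reasoning_effort="minimal",     # rejected by the old Literal, accepted in 0.1.24a0
    )
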
@@ -1,13 +1,13 @@
 from docent.data_models.agent_run import AgentRun
 from docent.data_models.citation import Citation
-from docent.data_models.judge import JudgeRunLabel
+from docent.data_models.judge import Label
 from docent.data_models.regex import RegexSnippet
 from docent.data_models.transcript import Transcript, TranscriptGroup

 __all__ = [
     "AgentRun",
     "Citation",
-    "JudgeRunLabel",
+    "Label",
     "RegexSnippet",
     "Transcript",
     "TranscriptGroup",
@@ -6,11 +6,14 @@ from uuid import uuid4
 from pydantic import BaseModel, Field


-class JudgeRunLabel(BaseModel):
+class Label(BaseModel):
     id: str = Field(default_factory=lambda: str(uuid4()))
+
+    label_set_id: str
+
+    label_value: dict[str, Any]
+
     agent_run_id: str
-    rubric_id: str
-    label: dict[str, Any]


-__all__ = ["JudgeRunLabel"]
+__all__ = ["Label"]
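
This is a breaking rename for the judge-label model: JudgeRunLabel becomes Label, and its rubric_id / label fields give way to label_set_id / label_value. A hedged before-and-after sketch, assuming label_set_id plays the role rubric_id did (all values illustrative):

    # 0.1.22a0
    JudgeRunLabel(agent_run_id="run-123", rubric_id="rubric-1", label={"score": 1})

    # 0.1.24a0
    Label(agent_run_id="run-123", label_set_id="set-1", label_value={"score": 1})
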
docent/judges/__init__.py CHANGED
@@ -3,6 +3,7 @@ from docent.judges.types import (
     JudgeResult,
     JudgeResultCompletionCallback,
     JudgeResultWithCitations,
+    JudgeVariant,
     ResultType,
     Rubric,
 )
@@ -18,4 +19,5 @@ __all__ = [
     "JudgeResultWithCitations",
     "JudgeResultCompletionCallback",
     "ResultType",
+    "JudgeVariant",
 ]
@@ -0,0 +1,77 @@
+import json
+from pathlib import Path
+from typing import Any
+
+import anyio
+from pydantic import BaseModel
+from pydantic_core import to_jsonable_python
+from tqdm.auto import tqdm
+
+from docent._log_util import get_logger
+from docent.data_models.agent_run import AgentRun
+from docent.judges.impl import BaseJudge
+from docent.judges.util.voting import JudgeOutputDistribution
+
+logger = get_logger(__name__)
+
+
+class MultiReflectRollouts(BaseModel):
+    """Object is associated with a single agent run"""
+
+    agent_run_id: str
+
+    first_step_rollouts: list[dict[str, Any]]
+    first_step_rollout_metadata: list[dict[str, Any] | None]
+    # Each index in second_step_rollouts corresponds to an index in first_step_combinations
+    # Step 2 rollouts are computed by passing each step 1 combo into the judge several times
+    first_step_combinations: list[list[dict[str, Any]]] | None = None
+    second_step_rollouts: list[list[dict[str, Any]]] | None = None
+    second_step_rollout_metadata: list[list[dict[str, Any] | None]] | None = None
+
+    distributions: dict[str, JudgeOutputDistribution]
+
+
+async def collect_judge_pvs(
+    judge: BaseJudge,
+    agent_runs: list[AgentRun],
+    *,
+    results_path: Path,
+    estimate_output_distrs_kwargs: dict[str, Any],
+):
+    if results_path.exists():
+        raise FileExistsError(f"Results path already exists: {results_path}")
+    results_path.parent.mkdir(parents=True, exist_ok=True)
+
+    results = dict[str, MultiReflectRollouts]()
+    persist_lock = anyio.Lock()
+    pbar = tqdm(total=len(agent_runs), desc="Processing agent runs")
+
+    async def _persist():
+        async with persist_lock:
+            with open(str(results_path), "w") as f:
+                json.dump(to_jsonable_python(results), f, indent=2)
+
+    async def _execute_for_agent_run(agent_run: AgentRun):
+        result = await judge.estimate_output_distrs(agent_run, **estimate_output_distrs_kwargs)
+        if result is None:
+            pbar.update(1)
+            return
+
+        distrs, metadata = result
+        results[agent_run.id] = MultiReflectRollouts.model_validate(
+            {
+                "agent_run_id": agent_run.id,
+                "distributions": distrs,
+                **metadata,
+            }
+        )
+        await _persist()
+        pbar.update(1)
+
+    async with anyio.create_task_group() as tg_outer:
+        for agent_run in agent_runs:
+            tg_outer.start_soon(_execute_for_agent_run, agent_run)
+
+    pbar.close()
+
+    return results
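
A hedged sketch of driving this new helper; constructing the BaseJudge and choosing the estimate_output_distrs kwargs are outside this diff, so they appear as placeholders:

    rollouts = await collect_judge_pvs(
        judge,                                   # a BaseJudge instance (construction not shown here)
        agent_runs,                              # list[AgentRun]
        results_path=Path("out/rollouts.json"),  # must not already exist
        estimate_output_distrs_kwargs={},        # forwarded to judge.estimate_output_distrs
    )
    # returns {agent_run_id: MultiReflectRollouts}; the same data is also written
    # incrementally to results_path after each completed run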