docent-python 0.1.61a0__tar.gz → 0.1.63a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/.gitignore +5 -1
  2. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/PKG-INFO +1 -1
  3. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/__init__.py +2 -0
  4. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/data_models/exceptions.py +18 -0
  5. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/data_models/llm_output.py +1 -5
  6. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/llm_svc.py +125 -165
  7. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/model_registry.py +3 -3
  8. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/anthropic.py +10 -4
  9. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/google.py +47 -31
  10. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/openai.py +38 -7
  11. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/openrouter.py +3 -1
  12. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_log_util/logger.py +3 -2
  13. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/__init__.py +2 -0
  14. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/agent_run.py +139 -165
  15. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/citation.py +49 -11
  16. docent_python-0.1.63a0/docent/data_models/context_config.py +88 -0
  17. docent_python-0.1.63a0/docent/data_models/metadata_util.py +180 -0
  18. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/reading.py +120 -48
  19. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/regex.py +2 -2
  20. docent_python-0.1.63a0/docent/data_models/report.py +16 -0
  21. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/transcript.py +75 -38
  22. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/impl.py +41 -44
  23. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/types.py +2 -2
  24. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/util/parse_output.py +1 -1
  25. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/loaders/load_inspect.py +1 -1
  26. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/mcp/server.py +250 -9
  27. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_base.py +53 -6
  28. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_client_util.py +75 -6
  29. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_collections.py +16 -12
  30. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_dql.py +8 -6
  31. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_readings.py +236 -44
  32. docent_python-0.1.63a0/docent/sdk/_reports.py +281 -0
  33. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_results.py +3 -3
  34. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_rubrics.py +1 -1
  35. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_sharing.py +1 -1
  36. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/agent_run_writer.py +9 -4
  37. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/client.py +2 -0
  38. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/integrations/inspect.py +8 -6
  39. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/llm_context.py +208 -94
  40. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/reading.py +19 -7
  41. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/trace.py +46 -41
  42. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/pyproject.toml +1 -1
  43. docent_python-0.1.61a0/docent/_llm_util/llm_cache.py +0 -206
  44. docent_python-0.1.61a0/docent/data_models/metadata_util.py +0 -32
  45. docent_python-0.1.61a0/docent/trace_temp.py +0 -1088
  46. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/LICENSE.md +0 -0
  47. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/README.md +0 -0
  48. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/__init__.py +0 -0
  49. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/data_models/__init__.py +0 -0
  50. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/__init__.py +0 -0
  51. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/common.py +0 -0
  52. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/preference_types.py +0 -0
  53. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/provider_registry.py +0 -0
  54. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_log_util/__init__.py +0 -0
  55. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/_tiktoken_util.py +0 -0
  56. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/chat/__init__.py +0 -0
  57. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/chat/content.py +0 -0
  58. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/chat/message.py +0 -0
  59. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/chat/response_format.py +0 -0
  60. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/chat/tool.py +0 -0
  61. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/feedback.py +0 -0
  62. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/formatted_objects.py +0 -0
  63. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/judge.py +0 -0
  64. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/util.py +0 -0
  65. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/__init__.py +0 -0
  66. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/analysis.py +0 -0
  67. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/runner.py +0 -0
  68. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/stats.py +0 -0
  69. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/util/forgiving_json.py +0 -0
  70. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/util/meta_schema.json +0 -0
  71. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/util/meta_schema.py +0 -0
  72. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/util/template_formatter.py +0 -0
  73. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/util/voting.py +0 -0
  74. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/mcp/__init__.py +0 -0
  75. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/mcp/__main__.py +0 -0
  76. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/py.typed +0 -0
  77. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/samples/__init__.py +0 -0
  78. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/samples/load.py +0 -0
  79. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/samples/log.eval +0 -0
  80. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/samples/tb_airline.json +0 -0
  81. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/__init__.py +0 -0
  82. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_agent_runs.py +0 -0
  83. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_feedback.py +0 -0
  84. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_labels.py +0 -0
  85. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/integrations/__init__.py +0 -0
  86. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/integrations/harbor.py +0 -0
  87. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/integrations/nemogym.py +0 -0
  88. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/integrations/util.py +0 -0
  89. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/llm_request.py +0 -0
  90. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/util.py +0 -0
  91. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/uv.lock +0 -0
File: .gitignore
@@ -145,8 +145,9 @@ ENV/
  env.bak/
  venv.bak/

- # Docent environment files
+ # Docent
  docent.env*
+ docent_analyses/

  # Spyder project settings
  .spyderproject
@@ -204,3 +205,6 @@ data/cache

  # dont commit package lock, force use of bun lock
  package-lock.json
+
+ # Claude Code worktrees
+ .claude/worktrees/

File: PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: docent-python
- Version: 0.1.61a0
+ Version: 0.1.63a0
  Summary: Docent SDK
  Project-URL: Homepage, https://github.com/TransluceAI/docent
  Project-URL: Issues, https://github.com/TransluceAI/docent/issues

File: docent/__init__.py
@@ -4,6 +4,7 @@ __all__ = [
  "load_config_file",
  "AgentRunRef",
  "TranscriptRef",
+ "TranscriptSliceRef",
  "ReadingResultRef",
  "ResultRef",
  "Prompt",
@@ -17,4 +18,5 @@ from docent.sdk.llm_context import (
  ReadingResultRef,
  ResultRef,
  TranscriptRef,
+ TranscriptSliceRef,
  )
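
Going by the export change above, TranscriptSliceRef should now be importable from the package root next to the existing reference types. A minimal sketch, assuming the 0.1.63a0 layout shown in this hunk (the class's fields are not visible in the diff):

    # TranscriptSliceRef is re-exported from docent.sdk.llm_context per the
    # import block above; only the import itself is taken from the change.
    from docent import AgentRunRef, TranscriptRef, TranscriptSliceRef

    print(TranscriptSliceRef.__module__)  # expected: docent.sdk.llm_context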

File: docent/_llm_util/data_models/exceptions.py
@@ -35,6 +35,11 @@ class ContextWindowException(LLMException):
  user_message = "Context window exceeded."


+ class InvalidPromptException(LLMException):
+ error_type_id = "invalid_prompt"
+ user_message = "The model provider rejected this prompt for safety reasons."
+
+
  class NoResponseException(LLMException):
  error_type_id = "no_response"
  user_message = "The model returned an empty response. Please try again later."
@@ -45,6 +50,17 @@ class DocentUsageLimitException(LLMException):
  user_message = "Free daily usage limit reached. Add your own API key in settings or contact us for increased limits."


+ class ProviderAuthenticationException(LLMException):
+ error_type_id = "provider_authentication"
+
+ def __init__(self, message: str = ""):
+ super().__init__(message)
+ self.user_message = (
+ "The model provider API key could not be authenticated. "
+ "If you added your own key, update it in Settings > Model providers."
+ )
+
+
  class ValidationFailedException(LLMException):
  error_type_id = "validation_failed"
  user_message = "The model returned invalid output that failed validation."
@@ -64,8 +80,10 @@ LLM_ERROR_TYPES: list[type[LLMException]] = [
  CompletionTooLongException,
  RateLimitException,
  ContextWindowException,
+ InvalidPromptException,
  NoResponseException,
  DocentUsageLimitException,
+ ProviderAuthenticationException,
  ValidationFailedException,
  TimeoutException,
  ]
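
For orientation, here is a hedged sketch of how consuming code might branch on the two new exception types; the class names, error_type_id values, and user_message attributes come from the hunk above, while the handler itself is illustrative:

    # Illustrative only: inspects the exception classes added in 0.1.63a0.
    from docent._llm_util.data_models.exceptions import (
        InvalidPromptException,
        LLMException,
        ProviderAuthenticationException,
    )

    def describe_llm_error(error: LLMException) -> str:
        # Both new types carry an error_type_id and a user-facing message,
        # like the other LLMException subclasses in this module.
        if isinstance(error, (InvalidPromptException, ProviderAuthenticationException)):
            return f"[{error.error_type_id}] {error.user_message}"
        return f"[{error.error_type_id}] {getattr(error, 'user_message', str(error))}"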

File: docent/_llm_util/data_models/llm_output.py
@@ -97,7 +97,6 @@ class LLMOutput:
  completions: list[LLMCompletion]
  errors: list[LLMException] = field(default_factory=lambda: [])
  usage: UsageMetrics = field(default_factory=UsageMetrics)
- from_cache: bool = False
  duration: float | None = None

  @property
@@ -142,7 +141,6 @@ class LLMOutput:
  "completions": [comp.model_dump() for comp in self.completions],
  "errors": [e.error_type_id for e in self.errors],
  "usage": self.usage.to_dict(),
- "from_cache": self.from_cache,
  "duration": self.duration,
  }

@@ -156,7 +154,7 @@ class LLMOutput:
  ]
  errors_to_log = [e for e in errors if e not in error_types_to_not_log]
  if errors_to_log:
- logger.error(f"Loading LLM output with errors: {errors}")
+ logger.error("Loading LLM output with errors: %s", errors)
  errors = [error_type_map.get(e, LLMException)() for e in errors]

  completions = data.get("completions", [])
@@ -171,7 +169,6 @@ class LLMOutput:
  completions=completions,
  errors=errors,
  usage=UsageMetrics(**usage),
- from_cache=bool(data.get("from_cache", False)),
  duration=data.get("duration"),
  )

@@ -275,7 +272,6 @@ def finalize_llm_output_partial(partial: LLMOutputPartial) -> LLMOutput:
  for c in partial.completions
  ],
  usage=partial.usage,
- from_cache=False,
  )

  # If the completion is empty and was truncated (likely due to too much reasoning), raise an exception

File: docent/_llm_util/llm_svc.py
@@ -28,7 +28,6 @@ from docent._llm_util.data_models.llm_output import (
  AsyncSingleLLMOutputStreamingCallback,
  LLMOutput,
  )
- from docent._llm_util.llm_cache import LLMCache
  from docent._llm_util.providers.preference_types import ModelOption
  from docent._llm_util.providers.provider_registry import (
  PROVIDERS,
@@ -37,6 +36,7 @@ from docent._llm_util.providers.provider_registry import (
  )
  from docent._log_util import get_logger
  from docent.data_models.chat import ChatMessage, ToolInfo, parse_chat_message
+ from docent.data_models.chat.message import AssistantMessage, UserMessage
  from docent.data_models.chat.response_format import ResponseFormat

  logger = get_logger(__name__)
@@ -91,8 +91,8 @@ async def _parallelize_calls(
  semaphore: Semaphore,
  max_retries: int,
  # use_tqdm: bool,
- cache: LLMCache | None = None,
  response_format: ResponseFormat | None = None,
+ retry_with_feedback: bool = False,
  ):
  base_func = partial(
  single_output_getter,
@@ -120,122 +120,129 @@ async def _parallelize_calls(
  else None
  )

- # Save resolved messages to avoid multiple resolutions
- resolved_messages: list[list[ChatMessage] | None] = [None] * len(inputs)
-
  # Not sure why the cast is necessary for the type checker
  cancelled_due_to_usage_limit: bool = cast(bool, False)

+ def _mark_usage_limit_responses() -> None:
+ for i, response in enumerate(responses):
+ if response is None:
+ responses[i] = LLMOutput(
+ model=model_name,
+ completions=[],
+ errors=[DocentUsageLimitException()],
+ )
+ elif not response.completions and not response.errors:
+ response.errors.append(DocentUsageLimitException())
+
  async def _limited_task(i: int, cur_input: MessagesInput, tg: TaskGroup):
- nonlocal responses, pbar, resolved_messages, cancelled_due_to_usage_limit
+ nonlocal responses, pbar, cancelled_due_to_usage_limit

  async with semaphore:
  messages = _resolve_messages_input(cur_input)
- resolved_messages[i] = messages

  retry_count = 0
  result = None
- call_started_at: float | None = None
-
- # Check if there's a cached result
- cached_result = (
- cache.get(
- messages,
- model_name,
- tools=tools,
- tool_choice=tool_choice,
- reasoning_effort=reasoning_effort,
- temperature=temperature,
- logprobs=logprobs,
- top_logprobs=top_logprobs,
- response_format=response_format,
- )
- if cache is not None
- else None
- )
- if cached_result is not None:
- result = cached_result
- if streaming_callback is not None:
- await streaming_callback(i, result)
- else:
- call_started_at = time.perf_counter()
- while retry_count < MAX_VALIDATION_ATTEMPTS:
- try:
- if streaming_callback is None:
- result = await base_func(client=client, messages=messages)
- else:
- result = await base_func(
- client=client,
- streaming_callback=_get_single_streaming_callback(
- i, streaming_callback
- ),
- messages=messages,
- )
-
- # Validate if validation callback provided and result is successful
- if validation_callback and not result.did_error:
- await validation_callback(i, result)
-
- break
- except ValidationFailedException as e:
- retry_count += 1
- logger.warning(
- f"Validation failed for {model_name} after {retry_count} attempts: {e}"
+ call_started_at = time.perf_counter()
+ current_messages = messages
+ while retry_count < MAX_VALIDATION_ATTEMPTS:
+ try:
+ if streaming_callback is None:
+ result = await base_func(client=client, messages=current_messages)
+ else:
+ result = await base_func(
+ client=client,
+ streaming_callback=_get_single_streaming_callback(
+ i, streaming_callback
+ ),
+ messages=current_messages,
  )
- if retry_count >= MAX_VALIDATION_ATTEMPTS:
- logger.error(
- f"Validation failed for {model_name} after {retry_count} attempts. Original output: {e.failed_output}"
- )
- result = LLMOutput(
- model=model_name,
- completions=[],
- errors=[e],
- )
- break
- except DocentUsageLimitException as _:
- result = LLMOutput(
- model=model_name,
- completions=[],
- errors=[], # Usage limit exceptions will be added to all results later if cancelled_due_to_usage_limit
+
+ # Validate if validation callback provided and result is successful
+ if validation_callback and not result.did_error:
+ await validation_callback(i, result)
+
+ break
+ except ValidationFailedException as e:
+ retry_count += 1
+ logger.warning(
+ "Validation failed for %s after %d attempts: %s",
+ model_name,
+ retry_count,
+ e,
+ extra={"original_output": e.failed_output},
+ )
+ if retry_count >= MAX_VALIDATION_ATTEMPTS:
+ logger.error(
+ "Validation failed for %s after %d attempts: %s",
+ model_name,
+ retry_count,
+ e,
+ extra={"original_output": e.failed_output},
  )
- cancelled_due_to_usage_limit = True
- tg.cancel_scope.cancel()
- break
- except asyncio.TimeoutError as e:
- timeout_exception = TimeoutException(str(e) or "Request timed out")
- timeout_exception.__cause__ = e
- logger.error(f"Call to {model_name} timed out")
  result = LLMOutput(
  model=model_name,
  completions=[],
- errors=[timeout_exception],
+ errors=[e],
  )
  break
- except Exception as e:
- if not isinstance(e, LLMException):
- logger.error(
- f"LLM call raised an exception that is not an LLMException: {e}. Failure traceback:\n{traceback.format_exc()}"
- )
- llm_exception = LLMException(e)
- llm_exception.__cause__ = e
- else:
- llm_exception = e
-
- error_message = f"Call to {model_name} failed even with backoff: {e.__class__.__name__}."
-
- if not isinstance(e, RateLimitException):
- error_message += f" Failure traceback:\n{traceback.format_exc()}"
- logger.error(error_message)

- result = LLMOutput(
- model=model_name,
- completions=[],
- errors=[llm_exception],
+ if retry_with_feedback:
+ # Build a new message list with the failed output and
+ # error feedback so the model can correct itself
+ current_messages = [
+ *messages,
+ AssistantMessage(content=e.failed_output or ""),
+ UserMessage(
+ content=f"Your previous output failed validation: {e}\n\nPlease try again with a corrected output."
+ ),
+ ]
+ except DocentUsageLimitException as _:
+ result = LLMOutput(
+ model=model_name,
+ completions=[],
+ errors=[], # Usage limit exceptions will be added to all results later if cancelled_due_to_usage_limit
+ )
+ cancelled_due_to_usage_limit = True
+ tg.cancel_scope.cancel()
+ break
+ except asyncio.TimeoutError as e:
+ timeout_exception = TimeoutException(str(e) or "Request timed out")
+ timeout_exception.__cause__ = e
+ logger.error("Call to %s timed out", model_name)
+ result = LLMOutput(
+ model=model_name,
+ completions=[],
+ errors=[timeout_exception],
+ )
+ break
+ except Exception as e:
+ if not isinstance(e, LLMException):
+ logger.error(
+ "LLM call raised an exception that is not an LLMException: %s. Failure traceback:\n%s",
+ e,
+ traceback.format_exc(),
  )
- break
+ llm_exception = LLMException(e)
+ llm_exception.__cause__ = e
+ else:
+ llm_exception = e
+
+ error_message = (
+ f"Call to {model_name} failed even with backoff: {e.__class__.__name__}."
+ )

- # Only store the elapsed time if we didn't hit the cache and the call was successful
- if cached_result is None and result is not None and call_started_at is not None:
+ if not isinstance(e, RateLimitException):
+ error_message += f" Failure traceback:\n{traceback.format_exc()}"
+ logger.error(error_message)
+
+ result = LLMOutput(
+ model=model_name,
+ completions=[],
+ errors=[llm_exception],
+ )
+ break
+
+ if result is not None:
  result.duration = time.perf_counter() - call_started_at

  # Always call completion callback with final result (success or error)
@@ -244,44 +251,14 @@ async def _parallelize_calls(
  await completion_callback(i, result)
  # LLMService uses this callback to record cost, and may throw an error if we just exceeded limit
  except DocentUsageLimitException as e:
- result.errors.append(e)
+ if not result.completions and not result.errors:
+ result.errors.append(e)
  cancelled_due_to_usage_limit = True
  tg.cancel_scope.cancel()

  responses[i] = result
  if pbar is not None:
  pbar.update(1)
- if pbar is None or pbar.n == pbar.total:
- tg.cancel_scope.cancel()
-
- def _cache_responses():
- nonlocal responses, cache
-
- if cache is not None:
- indices = [
- i
- for i, response in enumerate(responses)
- if resolved_messages[i] is not None
- and response is not None
- and not response.did_error
- ]
- cache.set_batch(
- # We already checked that each index has a resolved messages list
- [cast(list[ChatMessage], resolved_messages[i]) for i in indices],
- model_name,
- # We already checked that each index corresponds to an LLMOutput object
- [cast(LLMOutput, responses[i]) for i in indices],
- tools=tools,
- tool_choice=tool_choice,
- reasoning_effort=reasoning_effort,
- temperature=temperature,
- logprobs=logprobs,
- top_logprobs=top_logprobs,
- response_format=response_format,
- )
- return len(indices)
- else:
- return 0

  # Get all results concurrently
  try:
@@ -290,30 +267,14 @@ async def _parallelize_calls(
  for i, cur_input in enumerate(inputs):
  tg.start_soon(_limited_task, i, cur_input, tg)

- # Cache what we have so far if something got cancelled
  except anyio.get_cancelled_exc_class():
- num_cached = _cache_responses()
- if num_cached:
- logger.info(
- f"Cancelled {len(inputs) - num_cached} unfinished LLM API calls, but cached {num_cached} completed responses"
- )
-
- # If the task was cancelled due to usage limit, set the response to a usage limit exception
- if cancelled_due_to_usage_limit:
- for i, response in enumerate(responses):
- if response is None:
- responses[i] = LLMOutput(
- model=model_name,
- completions=[],
- errors=[DocentUsageLimitException()],
- )
- else:
- response.errors.append(DocentUsageLimitException())
+ if not cancelled_due_to_usage_limit:
+ raise

- raise
-
- # Cache results if available
- _cache_responses()
+ # If we stopped the batch due to usage limits, make sure every input has a
+ # structured result instead of relying on AnyIO's cancellation propagation.
+ if cancelled_due_to_usage_limit:
+ _mark_usage_limit_responses()

  # At this point, all indices should have a result
  assert all(isinstance(r, LLMOutput) for r in responses), (
@@ -357,9 +318,9 @@ class BaseLLMService:
  streaming_callback: AsyncLLMOutputStreamingCallback | None = None,
  validation_callback: AsyncLLMOutputStreamingCallback | None = None,
  completion_callback: AsyncLLMOutputStreamingCallback | None = None,
- use_cache: bool = False,
  response_format: ResponseFormat | None = None,
  max_retries: int = 1,
+ retry_with_feedback: bool = False,
  _api_key_overrides: dict[str, str] = dict(),
  ) -> list[LLMOutput]:
  """Request completions from a configured LLM provider."""
@@ -375,14 +336,6 @@ class BaseLLMService:
  f"Logprobs are not supported for Anthropic, so we can't use model {model_option.model_name}"
  )

- # Instantiate cache
- # TODO(mengk): make this more robust, possibly move to a NoSQL database or something
- try:
- cache = LLMCache() if use_cache else None
- except ValueError as e:
- logger.warning(f"Disabling LLM cache due to init error: {e}")
- cache = None
-
  # Initialize pointer to which model we're using; used for model rotation after failures
  current_model_option_index = 0

@@ -395,7 +348,7 @@ class BaseLLMService:
  return None
  new_model_option = model_options[current_model_option_index]
- logger.warning(f"Switched to next model {new_model_option.model_name}")
+ logger.warning("Switched to next model %s", new_model_option.model_name)
  return new_model_option

  while True:
@@ -413,7 +366,7 @@ class BaseLLMService:
  single_output_getter = PROVIDERS[provider]["single_output_getter"]
  single_streaming_output_getter = PROVIDERS[provider]["single_streaming_output_getter"]

- # Get completions for uncached messages
+ # Get completions for messages.
  outputs: list[LLMOutput] = await _parallelize_calls(
  (
  single_output_getter
@@ -436,11 +389,18 @@ class BaseLLMService:
  timeout=timeout,
  semaphore=self._semaphore,
  max_retries=max_retries,
- cache=cache,
  response_format=response_format,
+ retry_with_feedback=retry_with_feedback,
  )
  assert len(outputs) == len(inputs), "Number of outputs must match number of messages"

+ if any(
+ isinstance(e, DocentUsageLimitException)
+ for output in outputs
+ for e in output.errors
+ ):
+ break
+
  # Only count errors that should trigger model rotation (API errors, not validation/usage errors)
  num_rotation_errors = sum(
  1
@@ -452,7 +412,7 @@ class BaseLLMService:
  )
  )
  if num_rotation_errors > 0:
- logger.warning(f"{model_name}: {num_rotation_errors} API errors")
+ logger.warning("%s: %s API errors", model_name, num_rotation_errors)
  if not _rotate_model_option():
  break
  else:
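
The main behavioral additions in this file are the removal of the LLMCache path and the new retry_with_feedback flag, which reruns a call after a ValidationFailedException with the failed output and the error appended to the conversation. A standalone sketch of that loop, where generate and validate are hypothetical stand-ins for the provider call and the validation callback, and the retry limit value is assumed:

    # Sketch of the feedback-retry loop used when retry_with_feedback=True.
    # AssistantMessage, UserMessage, and ValidationFailedException are the names
    # imported in the diff; generate(), validate(), and the constant value below
    # are assumptions for illustration.
    from docent._llm_util.data_models.exceptions import ValidationFailedException
    from docent.data_models.chat.message import AssistantMessage, UserMessage

    MAX_VALIDATION_ATTEMPTS = 3  # assumed; the real constant lives in llm_svc.py

    async def call_with_feedback(messages, generate, validate):
        current_messages = list(messages)
        result = None
        for _ in range(MAX_VALIDATION_ATTEMPTS):
            result = await generate(current_messages)
            try:
                await validate(result)
                return result
            except ValidationFailedException as e:
                # Mirror _limited_task: resend the original messages plus the
                # failed output and an explicit correction request.
                current_messages = [
                    *messages,
                    AssistantMessage(content=e.failed_output or ""),
                    UserMessage(
                        content=(
                            f"Your previous output failed validation: {e}\n\n"
                            "Please try again with a corrected output."
                        )
                    ),
                ]
        # The real code wraps the final failure in an error LLMOutput;
        # this sketch just returns the last attempt.
        return result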

File: docent/_llm_util/model_registry.py
@@ -183,7 +183,7 @@ def get_model_info(model_name: str) -> Optional[ModelInfo]:
  def get_context_window(model_name: str) -> int:
  info = get_model_info(model_name)
  if info is None:
- logger.warning(f"No context window found for model {model_name}")
+ logger.warning("No context window found for model %s", model_name)
  return 100_000
  return info.context_window

@@ -196,11 +196,11 @@ def get_rates_for_model_name(model_name: str) -> Optional[ModelRate]:
  def estimate_cost_cents(model_name: str, token_count: int, token_type: TokenType) -> float:
  rate = get_rates_for_model_name(model_name)
  if rate is None:
- logger.warning(f"No rate found for model {model_name}")
+ logger.warning("No rate found for model %s", model_name)
  return 0.0
  usd_per_mtok = rate.get(token_type)
  if usd_per_mtok is None:
- logger.warning(f"No rate found for model {model_name} token type {token_type}")
+ logger.warning("No rate found for model %s token type %s", model_name, token_type)
  return 0.0
  cents_per_token = usd_per_mtok * 100 / 1_000_000.0
  return token_count * cents_per_token
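
The unchanged context lines above show the cost conversion: usd_per_mtok * 100 / 1,000,000 cents per token. A quick worked check, using a hypothetical rate of $3 per million tokens:

    # Worked example of the arithmetic in estimate_cost_cents (rate is hypothetical).
    usd_per_mtok = 3.0                                   # $3 per 1M tokens
    cents_per_token = usd_per_mtok * 100 / 1_000_000.0   # 0.0003 cents per token
    print(10_000 * cents_per_token)                      # 3.0 cents for 10k tokens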

File: docent/_llm_util/providers/anthropic.py
@@ -41,6 +41,7 @@ from docent._llm_util.data_models.exceptions import (
  CompletionTooLongException,
  ContextWindowException,
  NoResponseException,
+ ProviderAuthenticationException,
  RateLimitException,
  )
  from docent._llm_util.data_models.llm_output import (
@@ -78,7 +79,9 @@ ANTHROPIC_STRUCTURED_OUTPUTS_BETA = "structured-outputs-2025-11-13"

  def _print_backoff_message(e: Details):
  logger.warning(
- f"Anthropic backing off for {e['wait']:.2f}s due to {e['exception'].__class__.__name__}" # type: ignore
+ "Anthropic backing off for %.2fs due to %s",
+ e["wait"], # type: ignore
+ e["exception"].__class__.__name__, # type: ignore
  )


@@ -86,6 +89,7 @@ def _is_retryable_error(e: BaseException) -> bool:
  if (
  isinstance(e, BadRequestError)
  or isinstance(e, ContextWindowException)
+ or isinstance(e, ProviderAuthenticationException)
  or isinstance(e, AuthenticationError)
  or isinstance(e, NotImplementedError)
  or isinstance(e, PermissionDeniedError)
@@ -209,6 +213,8 @@ def _build_output_format(response_format: ResponseFormat | None) -> dict[str, An


  def _convert_anthropic_error(e: Exception):
+ if isinstance(e, (AuthenticationError, PermissionDeniedError)):
+ return ProviderAuthenticationException(e.message)
  if isinstance(e, BadRequestError):
  if "context limit" in e.message.lower() or "prompt is too long" in e.message.lower():
  return ContextWindowException()
@@ -285,7 +291,7 @@ async def get_anthropic_chat_completion_streaming_async(
  if llm_output_partial:
  return finalize_llm_output_partial(llm_output_partial)
  return LLMOutput(model=model_name, completions=[], errors=[NoResponseException()])
- except (RateLimitError, BadRequestError) as e:
+ except (RateLimitError, BadRequestError, AuthenticationError, PermissionDeniedError) as e:
  if e2 := _convert_anthropic_error(e):
  raise e2 from e
  raise
@@ -365,7 +371,7 @@ def update_llm_output(
  ):
  # This should not happen with a well-behaved API, log and skip
  logger.warning(
- f"Received InputJSONDelta before start event at index {index}, skipping"
+ "Received InputJSONDelta before start event at index %s, skipping", index
  )
  else:
  cur_tool_calls[index] = ToolCallPartial(
@@ -482,7 +488,7 @@ async def get_anthropic_chat_completion_async(
  )

  return output
- except (RateLimitError, BadRequestError) as e:
+ except (RateLimitError, BadRequestError, AuthenticationError, PermissionDeniedError) as e:
  if e2 := _convert_anthropic_error(e):
  raise e2 from e
  raise
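
Taken together, the Anthropic changes route AuthenticationError and PermissionDeniedError through _convert_anthropic_error into the new ProviderAuthenticationException and mark it non-retryable. A hedged sketch of the same pattern in isolation, with anthropic_call as a placeholder for the actual SDK request:

    # Illustrative: convert Anthropic auth failures into the SDK's
    # ProviderAuthenticationException instead of retrying with backoff.
    from anthropic import AuthenticationError, PermissionDeniedError

    from docent._llm_util.data_models.exceptions import ProviderAuthenticationException

    async def call_once(anthropic_call):
        try:
            return await anthropic_call()
        except (AuthenticationError, PermissionDeniedError) as e:
            # e.message is what _convert_anthropic_error forwards in the diff.
            raise ProviderAuthenticationException(e.message) from e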