docent-python 0.1.61a0__tar.gz → 0.1.63a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/.gitignore +5 -1
  2. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/PKG-INFO +1 -1
  3. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/__init__.py +2 -0
  4. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/data_models/exceptions.py +18 -0
  5. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/data_models/llm_output.py +1 -5
  6. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/llm_svc.py +125 -165
  7. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/model_registry.py +3 -3
  8. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/anthropic.py +10 -4
  9. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/google.py +47 -31
  10. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/openai.py +38 -7
  11. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/openrouter.py +3 -1
  12. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_log_util/logger.py +3 -2
  13. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/__init__.py +2 -0
  14. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/agent_run.py +139 -165
  15. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/citation.py +49 -11
  16. docent_python-0.1.63a0/docent/data_models/context_config.py +88 -0
  17. docent_python-0.1.63a0/docent/data_models/metadata_util.py +180 -0
  18. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/reading.py +120 -48
  19. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/regex.py +2 -2
  20. docent_python-0.1.63a0/docent/data_models/report.py +16 -0
  21. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/transcript.py +75 -38
  22. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/impl.py +41 -44
  23. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/types.py +2 -2
  24. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/util/parse_output.py +1 -1
  25. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/loaders/load_inspect.py +1 -1
  26. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/mcp/server.py +250 -9
  27. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_base.py +53 -6
  28. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_client_util.py +75 -6
  29. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_collections.py +16 -12
  30. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_dql.py +8 -6
  31. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_readings.py +236 -44
  32. docent_python-0.1.63a0/docent/sdk/_reports.py +281 -0
  33. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_results.py +3 -3
  34. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_rubrics.py +1 -1
  35. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_sharing.py +1 -1
  36. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/agent_run_writer.py +9 -4
  37. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/client.py +2 -0
  38. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/integrations/inspect.py +8 -6
  39. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/llm_context.py +208 -94
  40. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/reading.py +19 -7
  41. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/trace.py +46 -41
  42. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/pyproject.toml +1 -1
  43. docent_python-0.1.61a0/docent/_llm_util/llm_cache.py +0 -206
  44. docent_python-0.1.61a0/docent/data_models/metadata_util.py +0 -32
  45. docent_python-0.1.61a0/docent/trace_temp.py +0 -1088
  46. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/LICENSE.md +0 -0
  47. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/README.md +0 -0
  48. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/__init__.py +0 -0
  49. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/data_models/__init__.py +0 -0
  50. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/__init__.py +0 -0
  51. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/common.py +0 -0
  52. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/preference_types.py +0 -0
  53. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_llm_util/providers/provider_registry.py +0 -0
  54. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/_log_util/__init__.py +0 -0
  55. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/_tiktoken_util.py +0 -0
  56. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/chat/__init__.py +0 -0
  57. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/chat/content.py +0 -0
  58. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/chat/message.py +0 -0
  59. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/chat/response_format.py +0 -0
  60. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/chat/tool.py +0 -0
  61. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/feedback.py +0 -0
  62. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/formatted_objects.py +0 -0
  63. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/judge.py +0 -0
  64. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/data_models/util.py +0 -0
  65. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/__init__.py +0 -0
  66. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/analysis.py +0 -0
  67. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/runner.py +0 -0
  68. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/stats.py +0 -0
  69. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/util/forgiving_json.py +0 -0
  70. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/util/meta_schema.json +0 -0
  71. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/util/meta_schema.py +0 -0
  72. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/util/template_formatter.py +0 -0
  73. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/judges/util/voting.py +0 -0
  74. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/mcp/__init__.py +0 -0
  75. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/mcp/__main__.py +0 -0
  76. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/py.typed +0 -0
  77. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/samples/__init__.py +0 -0
  78. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/samples/load.py +0 -0
  79. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/samples/log.eval +0 -0
  80. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/samples/tb_airline.json +0 -0
  81. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/__init__.py +0 -0
  82. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_agent_runs.py +0 -0
  83. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_feedback.py +0 -0
  84. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/_labels.py +0 -0
  85. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/integrations/__init__.py +0 -0
  86. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/integrations/harbor.py +0 -0
  87. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/integrations/nemogym.py +0 -0
  88. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/integrations/util.py +0 -0
  89. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/llm_request.py +0 -0
  90. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/docent/sdk/util.py +0 -0
  91. {docent_python-0.1.61a0 → docent_python-0.1.63a0}/uv.lock +0 -0
File: .gitignore
@@ -145,8 +145,9 @@ ENV/
  env.bak/
  venv.bak/

- # Docent environment files
+ # Docent
  docent.env*
+ docent_analyses/

  # Spyder project settings
  .spyderproject
@@ -204,3 +205,6 @@ data/cache

  # dont commit package lock, force use of bun lock
  package-lock.json
+
+ # Claude Code worktrees
+ .claude/worktrees/

File: PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: docent-python
- Version: 0.1.61a0
+ Version: 0.1.63a0
  Summary: Docent SDK
  Project-URL: Homepage, https://github.com/TransluceAI/docent
  Project-URL: Issues, https://github.com/TransluceAI/docent/issues

File: docent/__init__.py
@@ -4,6 +4,7 @@ __all__ = [
  "load_config_file",
  "AgentRunRef",
  "TranscriptRef",
+ "TranscriptSliceRef",
  "ReadingResultRef",
  "ResultRef",
  "Prompt",
@@ -17,4 +18,5 @@ from docent.sdk.llm_context import (
  ReadingResultRef,
  ResultRef,
  TranscriptRef,
+ TranscriptSliceRef,
  )
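
Going by the export change above, TranscriptSliceRef should now be importable from the package root next to the existing reference types. A minimal sketch, assuming the 0.1.63a0 layout shown in this hunk (the class's fields are not visible in the diff):

    # TranscriptSliceRef is re-exported from docent.sdk.llm_context per the
    # import block above; only the import itself is taken from the change.
    from docent import AgentRunRef, TranscriptRef, TranscriptSliceRef

    print(TranscriptSliceRef.__module__)  # expected: docent.sdk.llm_context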

File: docent/_llm_util/data_models/exceptions.py
@@ -35,6 +35,11 @@ class ContextWindowException(LLMException):
  user_message = "Context window exceeded."


+ class InvalidPromptException(LLMException):
+ error_type_id = "invalid_prompt"
+ user_message = "The model provider rejected this prompt for safety reasons."
+
+
  class NoResponseException(LLMException):
  error_type_id = "no_response"
  user_message = "The model returned an empty response. Please try again later."
@@ -45,6 +50,17 @@ class DocentUsageLimitException(LLMException):
  user_message = "Free daily usage limit reached. Add your own API key in settings or contact us for increased limits."


+ class ProviderAuthenticationException(LLMException):
+ error_type_id = "provider_authentication"
+
+ def __init__(self, message: str = ""):
+ super().__init__(message)
+ self.user_message = (
+ "The model provider API key could not be authenticated. "
+ "If you added your own key, update it in Settings > Model providers."
+ )
+
+
  class ValidationFailedException(LLMException):
  error_type_id = "validation_failed"
  user_message = "The model returned invalid output that failed validation."
@@ -64,8 +80,10 @@ LLM_ERROR_TYPES: list[type[LLMException]] = [
  CompletionTooLongException,
  RateLimitException,
  ContextWindowException,
+ InvalidPromptException,
  NoResponseException,
  DocentUsageLimitException,
+ ProviderAuthenticationException,
  ValidationFailedException,
  TimeoutException,
  ]
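
For orientation, here is a hedged sketch of how consuming code might branch on the two new exception types; the class names, error_type_id values, and user_message attributes come from the hunk above, while the handler itself is illustrative:

    # Illustrative only: inspects the exception classes added in 0.1.63a0.
    from docent._llm_util.data_models.exceptions import (
        InvalidPromptException,
        LLMException,
        ProviderAuthenticationException,
    )

    def describe_llm_error(error: LLMException) -> str:
        # Both new types carry an error_type_id and a user-facing message,
        # like the other LLMException subclasses in this module.
        if isinstance(error, (InvalidPromptException, ProviderAuthenticationException)):
            return f"[{error.error_type_id}] {error.user_message}"
        return f"[{error.error_type_id}] {getattr(error, 'user_message', str(error))}"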

File: docent/_llm_util/data_models/llm_output.py
@@ -97,7 +97,6 @@ class LLMOutput:
  completions: list[LLMCompletion]
  errors: list[LLMException] = field(default_factory=lambda: [])
  usage: UsageMetrics = field(default_factory=UsageMetrics)
- from_cache: bool = False
  duration: float | None = None

  @property
@@ -142,7 +141,6 @@ class LLMOutput:
  "completions": [comp.model_dump() for comp in self.completions],
  "errors": [e.error_type_id for e in self.errors],
  "usage": self.usage.to_dict(),
- "from_cache": self.from_cache,
  "duration": self.duration,
  }

@@ -156,7 +154,7 @@ class LLMOutput:
  ]
  errors_to_log = [e for e in errors if e not in error_types_to_not_log]
  if errors_to_log:
- logger.error(f"Loading LLM output with errors: {errors}")
+ logger.error("Loading LLM output with errors: %s", errors)
  errors = [error_type_map.get(e, LLMException)() for e in errors]

  completions = data.get("completions", [])
@@ -171,7 +169,6 @@ class LLMOutput:
  completions=completions,
  errors=errors,
  usage=UsageMetrics(**usage),
- from_cache=bool(data.get("from_cache", False)),
  duration=data.get("duration"),
  )

@@ -275,7 +272,6 @@ def finalize_llm_output_partial(partial: LLMOutputPartial) -> LLMOutput:
  for c in partial.completions
  ],
  usage=partial.usage,
- from_cache=False,
  )

  # If the completion is empty and was truncated (likely due to too much reasoning), raise an exception

File: docent/_llm_util/llm_svc.py
@@ -28,7 +28,6 @@ from docent._llm_util.data_models.llm_output import (
  AsyncSingleLLMOutputStreamingCallback,
  LLMOutput,
  )
- from docent._llm_util.llm_cache import LLMCache
  from docent._llm_util.providers.preference_types import ModelOption
  from docent._llm_util.providers.provider_registry import (
  PROVIDERS,
@@ -37,6 +36,7 @@ from docent._llm_util.providers.provider_registry import (
  )
  from docent._log_util import get_logger
  from docent.data_models.chat import ChatMessage, ToolInfo, parse_chat_message
+ from docent.data_models.chat.message import AssistantMessage, UserMessage
  from docent.data_models.chat.response_format import ResponseFormat

  logger = get_logger(__name__)
@@ -91,8 +91,8 @@ async def _parallelize_calls(
  semaphore: Semaphore,
  max_retries: int,
  # use_tqdm: bool,
- cache: LLMCache | None = None,
  response_format: ResponseFormat | None = None,
+ retry_with_feedback: bool = False,
  ):
  base_func = partial(
  single_output_getter,
@@ -120,122 +120,129 @@ async def _parallelize_calls(
  else None
  )

- # Save resolved messages to avoid multiple resolutions
- resolved_messages: list[list[ChatMessage] | None] = [None] * len(inputs)
-
  # Not sure why the cast is necessary for the type checker
  cancelled_due_to_usage_limit: bool = cast(bool, False)

+ def _mark_usage_limit_responses() -> None:
+ for i, response in enumerate(responses):
+ if response is None:
+ responses[i] = LLMOutput(
+ model=model_name,
+ completions=[],
+ errors=[DocentUsageLimitException()],
+ )
+ elif not response.completions and not response.errors:
+ response.errors.append(DocentUsageLimitException())
+
  async def _limited_task(i: int, cur_input: MessagesInput, tg: TaskGroup):
- nonlocal responses, pbar, resolved_messages, cancelled_due_to_usage_limit
+ nonlocal responses, pbar, cancelled_due_to_usage_limit

  async with semaphore:
  messages = _resolve_messages_input(cur_input)
- resolved_messages[i] = messages

  retry_count = 0
  result = None
- call_started_at: float | None = None
-
- # Check if there's a cached result
- cached_result = (
- cache.get(
- messages,
- model_name,
- tools=tools,
- tool_choice=tool_choice,
- reasoning_effort=reasoning_effort,
- temperature=temperature,
- logprobs=logprobs,
- top_logprobs=top_logprobs,
- response_format=response_format,
- )
- if cache is not None
- else None
- )
- if cached_result is not None:
- result = cached_result
- if streaming_callback is not None:
- await streaming_callback(i, result)
- else:
- call_started_at = time.perf_counter()
- while retry_count < MAX_VALIDATION_ATTEMPTS:
- try:
- if streaming_callback is None:
- result = await base_func(client=client, messages=messages)
- else:
- result = await base_func(
- client=client,
- streaming_callback=_get_single_streaming_callback(
- i, streaming_callback
- ),
- messages=messages,
- )
-
- # Validate if validation callback provided and result is successful
- if validation_callback and not result.did_error:
- await validation_callback(i, result)
-
- break
- except ValidationFailedException as e:
- retry_count += 1
- logger.warning(
- f"Validation failed for {model_name} after {retry_count} attempts: {e}"
+ call_started_at = time.perf_counter()
+ current_messages = messages
+ while retry_count < MAX_VALIDATION_ATTEMPTS:
+ try:
+ if streaming_callback is None:
+ result = await base_func(client=client, messages=current_messages)
+ else:
+ result = await base_func(
+ client=client,
+ streaming_callback=_get_single_streaming_callback(
+ i, streaming_callback
+ ),
+ messages=current_messages,
  )
- if retry_count >= MAX_VALIDATION_ATTEMPTS:
- logger.error(
- f"Validation failed for {model_name} after {retry_count} attempts. Original output: {e.failed_output}"
- )
- result = LLMOutput(
- model=model_name,
- completions=[],
- errors=[e],
- )
- break
- except DocentUsageLimitException as _:
- result = LLMOutput(
- model=model_name,
- completions=[],
- errors=[], # Usage limit exceptions will be added to all results later if cancelled_due_to_usage_limit
+
+ # Validate if validation callback provided and result is successful
+ if validation_callback and not result.did_error:
+ await validation_callback(i, result)
+
+ break
+ except ValidationFailedException as e:
+ retry_count += 1
+ logger.warning(
+ "Validation failed for %s after %d attempts: %s",
+ model_name,
+ retry_count,
+ e,
+ extra={"original_output": e.failed_output},
+ )
+ if retry_count >= MAX_VALIDATION_ATTEMPTS:
+ logger.error(
+ "Validation failed for %s after %d attempts: %s",
+ model_name,
+ retry_count,
+ e,
+ extra={"original_output": e.failed_output},
  )
- cancelled_due_to_usage_limit = True
- tg.cancel_scope.cancel()
- break
- except asyncio.TimeoutError as e:
- timeout_exception = TimeoutException(str(e) or "Request timed out")
- timeout_exception.__cause__ = e
- logger.error(f"Call to {model_name} timed out")
  result = LLMOutput(
  model=model_name,
  completions=[],
- errors=[timeout_exception],
+ errors=[e],
  )
  break
- except Exception as e:
- if not isinstance(e, LLMException):
- logger.error(
- f"LLM call raised an exception that is not an LLMException: {e}. Failure traceback:\n{traceback.format_exc()}"
- )
- llm_exception = LLMException(e)
- llm_exception.__cause__ = e
- else:
- llm_exception = e
-
- error_message = f"Call to {model_name} failed even with backoff: {e.__class__.__name__}."
-
- if not isinstance(e, RateLimitException):
- error_message += f" Failure traceback:\n{traceback.format_exc()}"
- logger.error(error_message)

- result = LLMOutput(
- model=model_name,
- completions=[],
- errors=[llm_exception],
+ if retry_with_feedback:
+ # Build a new message list with the failed output and
+ # error feedback so the model can correct itself
+ current_messages = [
+ *messages,
+ AssistantMessage(content=e.failed_output or ""),
+ UserMessage(
+ content=f"Your previous output failed validation: {e}\n\nPlease try again with a corrected output."
+ ),
+ ]
+ except DocentUsageLimitException as _:
+ result = LLMOutput(
+ model=model_name,
+ completions=[],
+ errors=[], # Usage limit exceptions will be added to all results later if cancelled_due_to_usage_limit
+ )
+ cancelled_due_to_usage_limit = True
+ tg.cancel_scope.cancel()
+ break
+ except asyncio.TimeoutError as e:
+ timeout_exception = TimeoutException(str(e) or "Request timed out")
+ timeout_exception.__cause__ = e
+ logger.error("Call to %s timed out", model_name)
+ result = LLMOutput(
+ model=model_name,
+ completions=[],
+ errors=[timeout_exception],
+ )
+ break
+ except Exception as e:
+ if not isinstance(e, LLMException):
+ logger.error(
+ "LLM call raised an exception that is not an LLMException: %s. Failure traceback:\n%s",
+ e,
+ traceback.format_exc(),
  )
- break
+ llm_exception = LLMException(e)
+ llm_exception.__cause__ = e
+ else:
+ llm_exception = e
+
+ error_message = (
+ f"Call to {model_name} failed even with backoff: {e.__class__.__name__}."
+ )

- # Only store the elapsed time if we didn't hit the cache and the call was successful
- if cached_result is None and result is not None and call_started_at is not None:
+ if not isinstance(e, RateLimitException):
+ error_message += f" Failure traceback:\n{traceback.format_exc()}"
+ logger.error(error_message)
+
+ result = LLMOutput(
+ model=model_name,
+ completions=[],
+ errors=[llm_exception],
+ )
+ break
+
+ if result is not None:
  result.duration = time.perf_counter() - call_started_at

  # Always call completion callback with final result (success or error)
@@ -244,44 +251,14 @@ async def _parallelize_calls(
  await completion_callback(i, result)
  # LLMService uses this callback to record cost, and may throw an error if we just exceeded limit
  except DocentUsageLimitException as e:
- result.errors.append(e)
+ if not result.completions and not result.errors:
+ result.errors.append(e)
  cancelled_due_to_usage_limit = True
  tg.cancel_scope.cancel()

  responses[i] = result
  if pbar is not None:
  pbar.update(1)
- if pbar is None or pbar.n == pbar.total:
- tg.cancel_scope.cancel()
-
- def _cache_responses():
- nonlocal responses, cache
-
- if cache is not None:
- indices = [
- i
- for i, response in enumerate(responses)
- if resolved_messages[i] is not None
- and response is not None
- and not response.did_error
- ]
- cache.set_batch(
- # We already checked that each index has a resolved messages list
- [cast(list[ChatMessage], resolved_messages[i]) for i in indices],
- model_name,
- # We already checked that each index corresponds to an LLMOutput object
- [cast(LLMOutput, responses[i]) for i in indices],
- tools=tools,
- tool_choice=tool_choice,
- reasoning_effort=reasoning_effort,
- temperature=temperature,
- logprobs=logprobs,
- top_logprobs=top_logprobs,
- response_format=response_format,
- )
- return len(indices)
- else:
- return 0

  # Get all results concurrently
  try:
@@ -290,30 +267,14 @@ async def _parallelize_calls(
  for i, cur_input in enumerate(inputs):
  tg.start_soon(_limited_task, i, cur_input, tg)

- # Cache what we have so far if something got cancelled
  except anyio.get_cancelled_exc_class():
- num_cached = _cache_responses()
- if num_cached:
- logger.info(
- f"Cancelled {len(inputs) - num_cached} unfinished LLM API calls, but cached {num_cached} completed responses"
- )
-
- # If the task was cancelled due to usage limit, set the response to a usage limit exception
- if cancelled_due_to_usage_limit:
- for i, response in enumerate(responses):
- if response is None:
- responses[i] = LLMOutput(
- model=model_name,
- completions=[],
- errors=[DocentUsageLimitException()],
- )
- else:
- response.errors.append(DocentUsageLimitException())
+ if not cancelled_due_to_usage_limit:
+ raise

- raise
-
- # Cache results if available
- _cache_responses()
+ # If we stopped the batch due to usage limits, make sure every input has a
+ # structured result instead of relying on AnyIO's cancellation propagation.
+ if cancelled_due_to_usage_limit:
+ _mark_usage_limit_responses()

  # At this point, all indices should have a result
  assert all(isinstance(r, LLMOutput) for r in responses), (
@@ -357,9 +318,9 @@ class BaseLLMService:
  streaming_callback: AsyncLLMOutputStreamingCallback | None = None,
  validation_callback: AsyncLLMOutputStreamingCallback | None = None,
  completion_callback: AsyncLLMOutputStreamingCallback | None = None,
- use_cache: bool = False,
  response_format: ResponseFormat | None = None,
  max_retries: int = 1,
+ retry_with_feedback: bool = False,
  _api_key_overrides: dict[str, str] = dict(),
  ) -> list[LLMOutput]:
  """Request completions from a configured LLM provider."""
@@ -375,14 +336,6 @@ class BaseLLMService:
  f"Logprobs are not supported for Anthropic, so we can't use model {model_option.model_name}"
  )

- # Instantiate cache
- # TODO(mengk): make this more robust, possibly move to a NoSQL database or something
- try:
- cache = LLMCache() if use_cache else None
- except ValueError as e:
- logger.warning(f"Disabling LLM cache due to init error: {e}")
- cache = None
-
  # Initialize pointer to which model we're using; used for model rotation after failures
  current_model_option_index = 0

@@ -395,7 +348,7 @@ class BaseLLMService:
  return None
  new_model_option = model_options[current_model_option_index]
- logger.warning(f"Switched to next model {new_model_option.model_name}")
+ logger.warning("Switched to next model %s", new_model_option.model_name)
  return new_model_option

  while True:
@@ -413,7 +366,7 @@ class BaseLLMService:
  single_output_getter = PROVIDERS[provider]["single_output_getter"]
  single_streaming_output_getter = PROVIDERS[provider]["single_streaming_output_getter"]

- # Get completions for uncached messages
+ # Get completions for messages.
  outputs: list[LLMOutput] = await _parallelize_calls(
  (
  single_output_getter
@@ -436,11 +389,18 @@ class BaseLLMService:
  timeout=timeout,
  semaphore=self._semaphore,
  max_retries=max_retries,
- cache=cache,
  response_format=response_format,
+ retry_with_feedback=retry_with_feedback,
  )
  assert len(outputs) == len(inputs), "Number of outputs must match number of messages"

+ if any(
+ isinstance(e, DocentUsageLimitException)
+ for output in outputs
+ for e in output.errors
+ ):
+ break
+
  # Only count errors that should trigger model rotation (API errors, not validation/usage errors)
  num_rotation_errors = sum(
  1
@@ -452,7 +412,7 @@ class BaseLLMService:
  )
  )
  if num_rotation_errors > 0:
- logger.warning(f"{model_name}: {num_rotation_errors} API errors")
+ logger.warning("%s: %s API errors", model_name, num_rotation_errors)
  if not _rotate_model_option():
  break
  else:
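
The main behavioral additions in this file are the removal of the LLMCache path and the new retry_with_feedback flag, which reruns a call after a ValidationFailedException with the failed output and the error appended to the conversation. A standalone sketch of that loop, where generate and validate are hypothetical stand-ins for the provider call and the validation callback, and the retry limit value is assumed:

    # Sketch of the feedback-retry loop used when retry_with_feedback=True.
    # AssistantMessage, UserMessage, and ValidationFailedException are the names
    # imported in the diff; generate(), validate(), and the constant value below
    # are assumptions for illustration.
    from docent._llm_util.data_models.exceptions import ValidationFailedException
    from docent.data_models.chat.message import AssistantMessage, UserMessage

    MAX_VALIDATION_ATTEMPTS = 3  # assumed; the real constant lives in llm_svc.py

    async def call_with_feedback(messages, generate, validate):
        current_messages = list(messages)
        result = None
        for _ in range(MAX_VALIDATION_ATTEMPTS):
            result = await generate(current_messages)
            try:
                await validate(result)
                return result
            except ValidationFailedException as e:
                # Mirror _limited_task: resend the original messages plus the
                # failed output and an explicit correction request.
                current_messages = [
                    *messages,
                    AssistantMessage(content=e.failed_output or ""),
                    UserMessage(
                        content=(
                            f"Your previous output failed validation: {e}\n\n"
                            "Please try again with a corrected output."
                        )
                    ),
                ]
        # The real code wraps the final failure in an error LLMOutput;
        # this sketch just returns the last attempt.
        return result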

File: docent/_llm_util/model_registry.py
@@ -183,7 +183,7 @@ def get_model_info(model_name: str) -> Optional[ModelInfo]:
  def get_context_window(model_name: str) -> int:
  info = get_model_info(model_name)
  if info is None:
- logger.warning(f"No context window found for model {model_name}")
+ logger.warning("No context window found for model %s", model_name)
  return 100_000
  return info.context_window

@@ -196,11 +196,11 @@ def get_rates_for_model_name(model_name: str) -> Optional[ModelRate]:
  def estimate_cost_cents(model_name: str, token_count: int, token_type: TokenType) -> float:
  rate = get_rates_for_model_name(model_name)
  if rate is None:
- logger.warning(f"No rate found for model {model_name}")
+ logger.warning("No rate found for model %s", model_name)
  return 0.0
  usd_per_mtok = rate.get(token_type)
  if usd_per_mtok is None:
- logger.warning(f"No rate found for model {model_name} token type {token_type}")
+ logger.warning("No rate found for model %s token type %s", model_name, token_type)
  return 0.0
  cents_per_token = usd_per_mtok * 100 / 1_000_000.0
  return token_count * cents_per_token
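
The unchanged context lines above show the cost conversion: usd_per_mtok * 100 / 1,000,000 cents per token. A quick worked check, using a hypothetical rate of $3 per million tokens:

    # Worked example of the arithmetic in estimate_cost_cents (rate is hypothetical).
    usd_per_mtok = 3.0                                   # $3 per 1M tokens
    cents_per_token = usd_per_mtok * 100 / 1_000_000.0   # 0.0003 cents per token
    print(10_000 * cents_per_token)                      # 3.0 cents for 10k tokens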

File: docent/_llm_util/providers/anthropic.py
@@ -41,6 +41,7 @@ from docent._llm_util.data_models.exceptions import (
  CompletionTooLongException,
  ContextWindowException,
  NoResponseException,
+ ProviderAuthenticationException,
  RateLimitException,
  )
  from docent._llm_util.data_models.llm_output import (
@@ -78,7 +79,9 @@ ANTHROPIC_STRUCTURED_OUTPUTS_BETA = "structured-outputs-2025-11-13"

  def _print_backoff_message(e: Details):
  logger.warning(
- f"Anthropic backing off for {e['wait']:.2f}s due to {e['exception'].__class__.__name__}" # type: ignore
+ "Anthropic backing off for %.2fs due to %s",
+ e["wait"], # type: ignore
+ e["exception"].__class__.__name__, # type: ignore
  )


@@ -86,6 +89,7 @@ def _is_retryable_error(e: BaseException) -> bool:
  if (
  isinstance(e, BadRequestError)
  or isinstance(e, ContextWindowException)
+ or isinstance(e, ProviderAuthenticationException)
  or isinstance(e, AuthenticationError)
  or isinstance(e, NotImplementedError)
  or isinstance(e, PermissionDeniedError)
@@ -209,6 +213,8 @@ def _build_output_format(response_format: ResponseFormat | None) -> dict[str, An


  def _convert_anthropic_error(e: Exception):
+ if isinstance(e, (AuthenticationError, PermissionDeniedError)):
+ return ProviderAuthenticationException(e.message)
  if isinstance(e, BadRequestError):
  if "context limit" in e.message.lower() or "prompt is too long" in e.message.lower():
  return ContextWindowException()
@@ -285,7 +291,7 @@ async def get_anthropic_chat_completion_streaming_async(
  if llm_output_partial:
  return finalize_llm_output_partial(llm_output_partial)
  return LLMOutput(model=model_name, completions=[], errors=[NoResponseException()])
- except (RateLimitError, BadRequestError) as e:
+ except (RateLimitError, BadRequestError, AuthenticationError, PermissionDeniedError) as e:
  if e2 := _convert_anthropic_error(e):
  raise e2 from e
  raise
@@ -365,7 +371,7 @@ def update_llm_output(
  ):
  # This should not happen with a well-behaved API, log and skip
  logger.warning(
- f"Received InputJSONDelta before start event at index {index}, skipping"
+ "Received InputJSONDelta before start event at index %s, skipping", index
  )
  else:
  cur_tool_calls[index] = ToolCallPartial(
@@ -482,7 +488,7 @@ async def get_anthropic_chat_completion_async(
  )

  return output
- except (RateLimitError, BadRequestError) as e:
+ except (RateLimitError, BadRequestError, AuthenticationError, PermissionDeniedError) as e:
  if e2 := _convert_anthropic_error(e):
  raise e2 from e
  raise
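
Taken together, the Anthropic changes route AuthenticationError and PermissionDeniedError through _convert_anthropic_error into the new ProviderAuthenticationException and mark it non-retryable. A hedged sketch of the same pattern in isolation, with anthropic_call as a placeholder for the actual SDK request:

    # Illustrative: convert Anthropic auth failures into the SDK's
    # ProviderAuthenticationException instead of retrying with backoff.
    from anthropic import AuthenticationError, PermissionDeniedError

    from docent._llm_util.data_models.exceptions import ProviderAuthenticationException

    async def call_once(anthropic_call):
        try:
            return await anthropic_call()
        except (AuthenticationError, PermissionDeniedError) as e:
            # e.message is what _convert_anthropic_error forwards in the diff.
            raise ProviderAuthenticationException(e.message) from e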