docent-python 0.1.61a0__tar.gz → 0.1.62a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/.gitignore +5 -1
  2. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/PKG-INFO +1 -1
  3. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/data_models/llm_output.py +0 -4
  4. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/llm_svc.py +121 -163
  5. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_log_util/logger.py +3 -2
  6. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/__init__.py +2 -0
  7. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/agent_run.py +134 -162
  8. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/citation.py +5 -4
  9. docent_python-0.1.62a0/docent/data_models/context_config.py +88 -0
  10. docent_python-0.1.62a0/docent/data_models/metadata_util.py +180 -0
  11. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/reading.py +95 -33
  12. docent_python-0.1.62a0/docent/data_models/report.py +16 -0
  13. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/transcript.py +68 -38
  14. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/impl.py +41 -44
  15. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/util/parse_output.py +1 -1
  16. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/mcp/server.py +250 -9
  17. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_base.py +44 -2
  18. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_client_util.py +72 -4
  19. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_readings.py +235 -43
  20. docent_python-0.1.62a0/docent/sdk/_reports.py +281 -0
  21. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/client.py +2 -0
  22. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/llm_context.py +174 -83
  23. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/reading.py +19 -7
  24. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/pyproject.toml +1 -1
  25. docent_python-0.1.61a0/docent/_llm_util/llm_cache.py +0 -206
  26. docent_python-0.1.61a0/docent/data_models/metadata_util.py +0 -32
  27. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/LICENSE.md +0 -0
  28. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/README.md +0 -0
  29. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/__init__.py +0 -0
  30. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/__init__.py +0 -0
  31. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/data_models/__init__.py +0 -0
  32. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/data_models/exceptions.py +0 -0
  33. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/model_registry.py +0 -0
  34. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/__init__.py +0 -0
  35. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/anthropic.py +0 -0
  36. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/common.py +0 -0
  37. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/google.py +0 -0
  38. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/openai.py +0 -0
  39. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/openrouter.py +0 -0
  40. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/preference_types.py +0 -0
  41. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/provider_registry.py +0 -0
  42. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_log_util/__init__.py +0 -0
  43. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/_tiktoken_util.py +0 -0
  44. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/chat/__init__.py +0 -0
  45. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/chat/content.py +0 -0
  46. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/chat/message.py +0 -0
  47. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/chat/response_format.py +0 -0
  48. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/chat/tool.py +0 -0
  49. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/feedback.py +0 -0
  50. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/formatted_objects.py +0 -0
  51. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/judge.py +0 -0
  52. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/regex.py +0 -0
  53. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/util.py +0 -0
  54. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/__init__.py +0 -0
  55. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/analysis.py +0 -0
  56. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/runner.py +0 -0
  57. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/stats.py +0 -0
  58. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/types.py +0 -0
  59. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/util/forgiving_json.py +0 -0
  60. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/util/meta_schema.json +0 -0
  61. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/util/meta_schema.py +0 -0
  62. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/util/template_formatter.py +0 -0
  63. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/util/voting.py +0 -0
  64. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/loaders/load_inspect.py +0 -0
  65. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/mcp/__init__.py +0 -0
  66. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/mcp/__main__.py +0 -0
  67. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/py.typed +0 -0
  68. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/samples/__init__.py +0 -0
  69. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/samples/load.py +0 -0
  70. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/samples/log.eval +0 -0
  71. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/samples/tb_airline.json +0 -0
  72. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/__init__.py +0 -0
  73. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_agent_runs.py +0 -0
  74. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_collections.py +0 -0
  75. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_dql.py +0 -0
  76. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_feedback.py +0 -0
  77. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_labels.py +0 -0
  78. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_results.py +0 -0
  79. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_rubrics.py +0 -0
  80. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_sharing.py +0 -0
  81. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/agent_run_writer.py +0 -0
  82. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/integrations/__init__.py +0 -0
  83. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/integrations/harbor.py +0 -0
  84. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/integrations/inspect.py +0 -0
  85. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/integrations/nemogym.py +0 -0
  86. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/integrations/util.py +0 -0
  87. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/llm_request.py +0 -0
  88. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/util.py +0 -0
  89. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/trace.py +0 -0
  90. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/trace_temp.py +0 -0
  91. {docent_python-0.1.61a0 → docent_python-0.1.62a0}/uv.lock +0 -0
@@ -145,8 +145,9 @@ ENV/
  env.bak/
  venv.bak/
 
- # Docent environment files
+ # Docent
  docent.env*
+ docent_analyses/
 
  # Spyder project settings
  .spyderproject
@@ -204,3 +205,6 @@ data/cache
 
  # dont commit package lock, force use of bun lock
  package-lock.json
+
+ # Claude Code worktrees
+ .claude/worktrees/
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: docent-python
- Version: 0.1.61a0
+ Version: 0.1.62a0
  Summary: Docent SDK
  Project-URL: Homepage, https://github.com/TransluceAI/docent
  Project-URL: Issues, https://github.com/TransluceAI/docent/issues
@@ -97,7 +97,6 @@ class LLMOutput:
  completions: list[LLMCompletion]
  errors: list[LLMException] = field(default_factory=lambda: [])
  usage: UsageMetrics = field(default_factory=UsageMetrics)
- from_cache: bool = False
  duration: float | None = None
 
  @property
@@ -142,7 +141,6 @@ class LLMOutput:
  "completions": [comp.model_dump() for comp in self.completions],
  "errors": [e.error_type_id for e in self.errors],
  "usage": self.usage.to_dict(),
- "from_cache": self.from_cache,
  "duration": self.duration,
  }
 
@@ -171,7 +169,6 @@ class LLMOutput:
  completions=completions,
  errors=errors,
  usage=UsageMetrics(**usage),
- from_cache=bool(data.get("from_cache", False)),
  duration=data.get("duration"),
  )
 
@@ -275,7 +272,6 @@ def finalize_llm_output_partial(partial: LLMOutputPartial) -> LLMOutput:
  for c in partial.completions
  ],
  usage=partial.usage,
- from_cache=False,
  )
 
  # If the completion is empty and was truncated (likely due to too much reasoning), raise an exception
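
Taken together, the llm_output.py hunks drop the from_cache flag from LLMOutput and from its (de)serialization. A minimal sketch of the serialized shape after this change, using only the keys visible in the to_dict hunk above (any keys not shown in this diff are omitted):

    # Illustrative only: keys copied from the to_dict() hunk; values are placeholders.
    serialized = {
        "completions": [...],  # each entry is comp.model_dump()
        "errors": [...],       # each entry is an error_type_id
        "usage": {...},        # self.usage.to_dict()
        "duration": 1.23,      # seconds, or None
        # "from_cache" no longer appears as of 0.1.62a0
    }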
@@ -28,7 +28,6 @@ from docent._llm_util.data_models.llm_output import (
  AsyncSingleLLMOutputStreamingCallback,
  LLMOutput,
  )
- from docent._llm_util.llm_cache import LLMCache
  from docent._llm_util.providers.preference_types import ModelOption
  from docent._llm_util.providers.provider_registry import (
  PROVIDERS,
@@ -37,6 +36,7 @@ from docent._llm_util.providers.provider_registry import (
  )
  from docent._log_util import get_logger
  from docent.data_models.chat import ChatMessage, ToolInfo, parse_chat_message
+ from docent.data_models.chat.message import AssistantMessage, UserMessage
  from docent.data_models.chat.response_format import ResponseFormat
 
  logger = get_logger(__name__)
@@ -91,8 +91,8 @@ async def _parallelize_calls(
  semaphore: Semaphore,
  max_retries: int,
  # use_tqdm: bool,
- cache: LLMCache | None = None,
  response_format: ResponseFormat | None = None,
+ retry_with_feedback: bool = False,
  ):
  base_func = partial(
  single_output_getter,
@@ -120,122 +120,127 @@ async def _parallelize_calls(
  else None
  )
 
- # Save resolved messages to avoid multiple resolutions
- resolved_messages: list[list[ChatMessage] | None] = [None] * len(inputs)
-
  # Not sure why the cast is necessary for the type checker
  cancelled_due_to_usage_limit: bool = cast(bool, False)
 
+ def _mark_usage_limit_responses() -> None:
+ for i, response in enumerate(responses):
+ if response is None:
+ responses[i] = LLMOutput(
+ model=model_name,
+ completions=[],
+ errors=[DocentUsageLimitException()],
+ )
+ elif not response.completions and not response.errors:
+ response.errors.append(DocentUsageLimitException())
+
  async def _limited_task(i: int, cur_input: MessagesInput, tg: TaskGroup):
- nonlocal responses, pbar, resolved_messages, cancelled_due_to_usage_limit
+ nonlocal responses, pbar, cancelled_due_to_usage_limit
 
  async with semaphore:
  messages = _resolve_messages_input(cur_input)
- resolved_messages[i] = messages
 
  retry_count = 0
  result = None
- call_started_at: float | None = None
-
- # Check if there's a cached result
- cached_result = (
- cache.get(
- messages,
- model_name,
- tools=tools,
- tool_choice=tool_choice,
- reasoning_effort=reasoning_effort,
- temperature=temperature,
- logprobs=logprobs,
- top_logprobs=top_logprobs,
- response_format=response_format,
- )
- if cache is not None
- else None
- )
- if cached_result is not None:
- result = cached_result
- if streaming_callback is not None:
- await streaming_callback(i, result)
- else:
- call_started_at = time.perf_counter()
- while retry_count < MAX_VALIDATION_ATTEMPTS:
- try:
- if streaming_callback is None:
- result = await base_func(client=client, messages=messages)
- else:
- result = await base_func(
- client=client,
- streaming_callback=_get_single_streaming_callback(
- i, streaming_callback
- ),
- messages=messages,
- )
-
- # Validate if validation callback provided and result is successful
- if validation_callback and not result.did_error:
- await validation_callback(i, result)
-
- break
- except ValidationFailedException as e:
- retry_count += 1
- logger.warning(
- f"Validation failed for {model_name} after {retry_count} attempts: {e}"
+ call_started_at = time.perf_counter()
+ current_messages = messages
+ while retry_count < MAX_VALIDATION_ATTEMPTS:
+ try:
+ if streaming_callback is None:
+ result = await base_func(client=client, messages=current_messages)
+ else:
+ result = await base_func(
+ client=client,
+ streaming_callback=_get_single_streaming_callback(
+ i, streaming_callback
+ ),
+ messages=current_messages,
  )
- if retry_count >= MAX_VALIDATION_ATTEMPTS:
- logger.error(
- f"Validation failed for {model_name} after {retry_count} attempts. Original output: {e.failed_output}"
- )
- result = LLMOutput(
- model=model_name,
- completions=[],
- errors=[e],
- )
- break
- except DocentUsageLimitException as _:
- result = LLMOutput(
- model=model_name,
- completions=[],
- errors=[], # Usage limit exceptions will be added to all results later if cancelled_due_to_usage_limit
+
+ # Validate if validation callback provided and result is successful
+ if validation_callback and not result.did_error:
+ await validation_callback(i, result)
+
+ break
+ except ValidationFailedException as e:
+ retry_count += 1
+ logger.warning(
+ "Validation failed for %s after %d attempts: %s",
+ model_name,
+ retry_count,
+ e,
+ extra={"original_output": e.failed_output},
+ )
+ if retry_count >= MAX_VALIDATION_ATTEMPTS:
+ logger.error(
+ "Validation failed for %s after %d attempts: %s",
+ model_name,
+ retry_count,
+ e,
+ extra={"original_output": e.failed_output},
  )
- cancelled_due_to_usage_limit = True
- tg.cancel_scope.cancel()
- break
- except asyncio.TimeoutError as e:
- timeout_exception = TimeoutException(str(e) or "Request timed out")
- timeout_exception.__cause__ = e
- logger.error(f"Call to {model_name} timed out")
  result = LLMOutput(
  model=model_name,
  completions=[],
- errors=[timeout_exception],
+ errors=[e],
  )
  break
- except Exception as e:
- if not isinstance(e, LLMException):
- logger.error(
- f"LLM call raised an exception that is not an LLMException: {e}. Failure traceback:\n{traceback.format_exc()}"
- )
- llm_exception = LLMException(e)
- llm_exception.__cause__ = e
- else:
- llm_exception = e
-
- error_message = f"Call to {model_name} failed even with backoff: {e.__class__.__name__}."
-
- if not isinstance(e, RateLimitException):
- error_message += f" Failure traceback:\n{traceback.format_exc()}"
- logger.error(error_message)
 
- result = LLMOutput(
- model=model_name,
- completions=[],
- errors=[llm_exception],
+ if retry_with_feedback:
+ # Build a new message list with the failed output and
+ # error feedback so the model can correct itself
+ current_messages = [
+ *messages,
+ AssistantMessage(content=e.failed_output or ""),
+ UserMessage(
+ content=f"Your previous output failed validation: {e}\n\nPlease try again with a corrected output."
+ ),
+ ]
+ except DocentUsageLimitException as _:
+ result = LLMOutput(
+ model=model_name,
+ completions=[],
+ errors=[], # Usage limit exceptions will be added to all results later if cancelled_due_to_usage_limit
+ )
+ cancelled_due_to_usage_limit = True
+ tg.cancel_scope.cancel()
+ break
+ except asyncio.TimeoutError as e:
+ timeout_exception = TimeoutException(str(e) or "Request timed out")
+ timeout_exception.__cause__ = e
+ logger.error(f"Call to {model_name} timed out")
+ result = LLMOutput(
+ model=model_name,
+ completions=[],
+ errors=[timeout_exception],
+ )
+ break
+ except Exception as e:
+ if not isinstance(e, LLMException):
+ logger.error(
+ f"LLM call raised an exception that is not an LLMException: {e}. Failure traceback:\n{traceback.format_exc()}"
  )
- break
+ llm_exception = LLMException(e)
+ llm_exception.__cause__ = e
+ else:
+ llm_exception = e
+
+ error_message = (
+ f"Call to {model_name} failed even with backoff: {e.__class__.__name__}."
+ )
 
- # Only store the elapsed time if we didn't hit the cache and the call was successful
- if cached_result is None and result is not None and call_started_at is not None:
+ if not isinstance(e, RateLimitException):
+ error_message += f" Failure traceback:\n{traceback.format_exc()}"
+ logger.error(error_message)
+
+ result = LLMOutput(
+ model=model_name,
+ completions=[],
+ errors=[llm_exception],
+ )
+ break
+
+ if result is not None:
  result.duration = time.perf_counter() - call_started_at
 
  # Always call completion callback with final result (success or error)
@@ -244,44 +249,14 @@ async def _parallelize_calls(
  await completion_callback(i, result)
  # LLMService uses this callback to record cost, and may throw an error if we just exceeded limit
  except DocentUsageLimitException as e:
- result.errors.append(e)
+ if not result.completions and not result.errors:
+ result.errors.append(e)
  cancelled_due_to_usage_limit = True
  tg.cancel_scope.cancel()
 
  responses[i] = result
  if pbar is not None:
  pbar.update(1)
- if pbar is None or pbar.n == pbar.total:
- tg.cancel_scope.cancel()
-
- def _cache_responses():
- nonlocal responses, cache
-
- if cache is not None:
- indices = [
- i
- for i, response in enumerate(responses)
- if resolved_messages[i] is not None
- and response is not None
- and not response.did_error
- ]
- cache.set_batch(
- # We already checked that each index has a resolved messages list
- [cast(list[ChatMessage], resolved_messages[i]) for i in indices],
- model_name,
- # We already checked that each index corresponds to an LLMOutput object
- [cast(LLMOutput, responses[i]) for i in indices],
- tools=tools,
- tool_choice=tool_choice,
- reasoning_effort=reasoning_effort,
- temperature=temperature,
- logprobs=logprobs,
- top_logprobs=top_logprobs,
- response_format=response_format,
- )
- return len(indices)
- else:
- return 0
 
  # Get all results concurrently
  try:
@@ -290,30 +265,14 @@
  for i, cur_input in enumerate(inputs):
  tg.start_soon(_limited_task, i, cur_input, tg)
 
- # Cache what we have so far if something got cancelled
  except anyio.get_cancelled_exc_class():
- num_cached = _cache_responses()
- if num_cached:
- logger.info(
- f"Cancelled {len(inputs) - num_cached} unfinished LLM API calls, but cached {num_cached} completed responses"
- )
-
- # If the task was cancelled due to usage limit, set the response to a usage limit exception
- if cancelled_due_to_usage_limit:
- for i, response in enumerate(responses):
- if response is None:
- responses[i] = LLMOutput(
- model=model_name,
- completions=[],
- errors=[DocentUsageLimitException()],
- )
- else:
- response.errors.append(DocentUsageLimitException())
+ if not cancelled_due_to_usage_limit:
+ raise
 
- raise
-
- # Cache results if available
- _cache_responses()
+ # If we stopped the batch due to usage limits, make sure every input has a
+ # structured result instead of relying on AnyIO's cancellation propagation.
+ if cancelled_due_to_usage_limit:
+ _mark_usage_limit_responses()
 
  # At this point, all indices should have a result
  assert all(isinstance(r, LLMOutput) for r in responses), (
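
With the local cache removed, cancellation handling is simpler: when a DocentUsageLimitException cancels the task group, the new _mark_usage_limit_responses helper backfills a structured LLMOutput for every input instead of re-raising the cancellation. A minimal caller-side sketch for detecting that a batch was cut short; the string-based type check is a hedge because the exception's import path is not shown in this diff:

    from docent._llm_util.data_models.llm_output import LLMOutput  # path shown in the imports hunk above

    def batch_hit_usage_limit(outputs: list[LLMOutput]) -> bool:
        # Mirrors the check added further down in llm_svc.py: any output carrying
        # a DocentUsageLimitException means the batch was stopped early.
        return any(
            type(e).__name__ == "DocentUsageLimitException"
            for output in outputs
            for e in output.errors
        )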
@@ -357,9 +316,9 @@ class BaseLLMService:
  streaming_callback: AsyncLLMOutputStreamingCallback | None = None,
  validation_callback: AsyncLLMOutputStreamingCallback | None = None,
  completion_callback: AsyncLLMOutputStreamingCallback | None = None,
- use_cache: bool = False,
  response_format: ResponseFormat | None = None,
  max_retries: int = 1,
+ retry_with_feedback: bool = False,
  _api_key_overrides: dict[str, str] = dict(),
  ) -> list[LLMOutput]:
  """Request completions from a configured LLM provider."""
@@ -375,14 +334,6 @@ class BaseLLMService:
  f"Logprobs are not supported for Anthropic, so we can't use model {model_option.model_name}"
  )
 
- # Instantiate cache
- # TODO(mengk): make this more robust, possibly move to a NoSQL database or something
- try:
- cache = LLMCache() if use_cache else None
- except ValueError as e:
- logger.warning(f"Disabling LLM cache due to init error: {e}")
- cache = None
-
  # Initialize pointer to which model we're using; used for model rotation after failures
  current_model_option_index = 0
 
@@ -413,7 +364,7 @@
  single_output_getter = PROVIDERS[provider]["single_output_getter"]
  single_streaming_output_getter = PROVIDERS[provider]["single_streaming_output_getter"]
 
- # Get completions for uncached messages
+ # Get completions for messages.
  outputs: list[LLMOutput] = await _parallelize_calls(
  (
  single_output_getter
@@ -436,11 +387,18 @@
  timeout=timeout,
  semaphore=self._semaphore,
  max_retries=max_retries,
- cache=cache,
  response_format=response_format,
+ retry_with_feedback=retry_with_feedback,
  )
  assert len(outputs) == len(inputs), "Number of outputs must match number of messages"
 
+ if any(
+ isinstance(e, DocentUsageLimitException)
+ for output in outputs
+ for e in output.errors
+ ):
+ break
+
  # Only count errors that should trigger model rotation (API errors, not validation/usage errors)
  num_rotation_errors = sum(
  1
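
In place of the use_cache/cache plumbing, callers get a retry_with_feedback flag: on ValidationFailedException, the failed output is replayed as an AssistantMessage followed by a corrective UserMessage and the call is retried, and a batch now stops rotating models once any output carries a DocentUsageLimitException. A rough caller-side sketch under stated assumptions: get_completions is a placeholder for the BaseLLMService method whose signature appears above (its real name is not visible in this hunk), and require_valid stands in for any callback that raises ValidationFailedException on bad output:

    from docent._llm_util.data_models.llm_output import LLMOutput

    async def run_batch(svc, inputs, require_valid) -> list[LLMOutput]:
        # Keyword arguments below match the signature hunk; the positional
        # `inputs` argument and the method name are assumptions, not confirmed API.
        return await svc.get_completions(
            inputs,
            validation_callback=require_valid,
            retry_with_feedback=True,  # new in 0.1.62a0; defaults to False
            max_retries=1,
        )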
@@ -1,4 +1,5 @@
  import logging
+ import os
  import sys
  from dataclasses import dataclass
  from typing import IO, Any, Dict, Literal, MutableMapping, Optional, Tuple
@@ -135,8 +136,8 @@ def get_logger(namespace: str, stream: IO[str] | None = None) -> LoggerAdapter:
  handler.setFormatter(ColoredFormatter())
  logger.addHandler(handler)
 
- # Set default level to INFO
- logger.setLevel(logging.INFO)
+ level_name = os.environ.get("DOCENT_LOG_LEVEL", "INFO").upper()
+ logger.setLevel(getattr(logging, level_name, logging.INFO))
 
  # Wrap with adapter to support highlighting
  return LoggerAdapter(logger, {})
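
get_logger now reads its level from the environment instead of hard-coding INFO. A minimal usage sketch; the DOCENT_LOG_LEVEL variable and the INFO fallback are exactly what the hunk above adds, and the import path for get_logger is the one shown in the llm_svc.py imports:

    import os

    # Must be set before the first get_logger() call configures the logger;
    # unrecognized names fall back to INFO via getattr(logging, name, logging.INFO).
    os.environ["DOCENT_LOG_LEVEL"] = "DEBUG"

    from docent._log_util import get_logger

    logger = get_logger(__name__)
    logger.debug("visible because DOCENT_LOG_LEVEL=DEBUG")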
@@ -2,6 +2,7 @@ from docent.data_models.agent_run import AgentRun
  from docent.data_models.citation import InlineCitation
  from docent.data_models.judge import Label
  from docent.data_models.regex import RegexSnippet
+ from docent.data_models.report import Report
  from docent.data_models.transcript import Transcript, TranscriptGroup
 
  __all__ = [
@@ -9,6 +10,7 @@ __all__ = [
  "InlineCitation",
  "Label",
  "RegexSnippet",
+ "Report",
  "Transcript",
  "TranscriptGroup",
  ]
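
The new Report model is now re-exported from docent.data_models. Its fields are defined in the new docent/data_models/report.py and are not shown in this diff; only the import location below is confirmed by these hunks:

    from docent.data_models import Report  # new public export in 0.1.62a0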