docent-python 0.1.61a0__tar.gz → 0.1.62a0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/.gitignore +5 -1
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/PKG-INFO +1 -1
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/data_models/llm_output.py +0 -4
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/llm_svc.py +121 -163
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_log_util/logger.py +3 -2
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/__init__.py +2 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/agent_run.py +134 -162
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/citation.py +5 -4
- docent_python-0.1.62a0/docent/data_models/context_config.py +88 -0
- docent_python-0.1.62a0/docent/data_models/metadata_util.py +180 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/reading.py +95 -33
- docent_python-0.1.62a0/docent/data_models/report.py +16 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/transcript.py +68 -38
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/impl.py +41 -44
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/util/parse_output.py +1 -1
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/mcp/server.py +250 -9
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_base.py +44 -2
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_client_util.py +72 -4
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_readings.py +235 -43
- docent_python-0.1.62a0/docent/sdk/_reports.py +281 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/client.py +2 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/llm_context.py +174 -83
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/reading.py +19 -7
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/pyproject.toml +1 -1
- docent_python-0.1.61a0/docent/_llm_util/llm_cache.py +0 -206
- docent_python-0.1.61a0/docent/data_models/metadata_util.py +0 -32
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/LICENSE.md +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/README.md +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/__init__.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/__init__.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/data_models/__init__.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/data_models/exceptions.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/model_registry.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/__init__.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/anthropic.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/common.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/google.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/openai.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/openrouter.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/preference_types.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/providers/provider_registry.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_log_util/__init__.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/_tiktoken_util.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/chat/__init__.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/chat/content.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/chat/message.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/chat/response_format.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/chat/tool.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/feedback.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/formatted_objects.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/judge.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/regex.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/util.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/__init__.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/analysis.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/runner.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/stats.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/types.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/util/forgiving_json.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/util/meta_schema.json +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/util/meta_schema.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/util/template_formatter.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/judges/util/voting.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/loaders/load_inspect.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/mcp/__init__.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/mcp/__main__.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/py.typed +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/samples/__init__.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/samples/load.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/samples/log.eval +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/samples/tb_airline.json +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/__init__.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_agent_runs.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_collections.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_dql.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_feedback.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_labels.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_results.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_rubrics.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/_sharing.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/agent_run_writer.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/integrations/__init__.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/integrations/harbor.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/integrations/inspect.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/integrations/nemogym.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/integrations/util.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/llm_request.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/sdk/util.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/trace.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/trace_temp.py +0 -0
- {docent_python-0.1.61a0 → docent_python-0.1.62a0}/uv.lock +0 -0
{docent_python-0.1.61a0 → docent_python-0.1.62a0}/.gitignore
RENAMED
```diff
@@ -145,8 +145,9 @@ ENV/
 env.bak/
 venv.bak/
 
-# Docent
+# Docent
 docent.env*
+docent_analyses/
 
 # Spyder project settings
 .spyderproject
@@ -204,3 +205,6 @@ data/cache
 
 # dont commit package lock, force use of bun lock
 package-lock.json
+
+# Claude Code worktrees
+.claude/worktrees/
```
{docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/data_models/llm_output.py
RENAMED
```diff
@@ -97,7 +97,6 @@ class LLMOutput:
     completions: list[LLMCompletion]
     errors: list[LLMException] = field(default_factory=lambda: [])
     usage: UsageMetrics = field(default_factory=UsageMetrics)
-    from_cache: bool = False
     duration: float | None = None
 
     @property
@@ -142,7 +141,6 @@ class LLMOutput:
             "completions": [comp.model_dump() for comp in self.completions],
             "errors": [e.error_type_id for e in self.errors],
             "usage": self.usage.to_dict(),
-            "from_cache": self.from_cache,
             "duration": self.duration,
         }
 
@@ -171,7 +169,6 @@ class LLMOutput:
             completions=completions,
             errors=errors,
             usage=UsageMetrics(**usage),
-            from_cache=bool(data.get("from_cache", False)),
             duration=data.get("duration"),
         )
 
@@ -275,7 +272,6 @@ def finalize_llm_output_partial(partial: LLMOutputPartial) -> LLMOutput:
             for c in partial.completions
         ],
         usage=partial.usage,
-        from_cache=False,
     )
 
     # If the completion is empty and was truncated (likely due to too much reasoning), raise an exception
```
{docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_llm_util/llm_svc.py
RENAMED
```diff
@@ -28,7 +28,6 @@ from docent._llm_util.data_models.llm_output import (
     AsyncSingleLLMOutputStreamingCallback,
     LLMOutput,
 )
-from docent._llm_util.llm_cache import LLMCache
 from docent._llm_util.providers.preference_types import ModelOption
 from docent._llm_util.providers.provider_registry import (
     PROVIDERS,
@@ -37,6 +36,7 @@ from docent._llm_util.providers.provider_registry import (
 )
 from docent._log_util import get_logger
 from docent.data_models.chat import ChatMessage, ToolInfo, parse_chat_message
+from docent.data_models.chat.message import AssistantMessage, UserMessage
 from docent.data_models.chat.response_format import ResponseFormat
 
 logger = get_logger(__name__)
```
```diff
@@ -91,8 +91,8 @@ async def _parallelize_calls(
     semaphore: Semaphore,
     max_retries: int,
     # use_tqdm: bool,
-    cache: LLMCache | None = None,
     response_format: ResponseFormat | None = None,
+    retry_with_feedback: bool = False,
 ):
     base_func = partial(
         single_output_getter,
```
```diff
@@ -120,122 +120,127 @@ async def _parallelize_calls(
         else None
     )
 
-    # Save resolved messages to avoid multiple resolutions
-    resolved_messages: list[list[ChatMessage] | None] = [None] * len(inputs)
-
     # Not sure why the cast is necessary for the type checker
     cancelled_due_to_usage_limit: bool = cast(bool, False)
 
+    def _mark_usage_limit_responses() -> None:
+        for i, response in enumerate(responses):
+            if response is None:
+                responses[i] = LLMOutput(
+                    model=model_name,
+                    completions=[],
+                    errors=[DocentUsageLimitException()],
+                )
+            elif not response.completions and not response.errors:
+                response.errors.append(DocentUsageLimitException())
+
     async def _limited_task(i: int, cur_input: MessagesInput, tg: TaskGroup):
-        nonlocal responses, pbar,
+        nonlocal responses, pbar, cancelled_due_to_usage_limit
 
         async with semaphore:
            messages = _resolve_messages_input(cur_input)
-            resolved_messages[i] = messages
 
            retry_count = 0
            result = None
-            call_started_at
-
-
-
-
-
-
-
-
-
-
-
-
-                    response_format=response_format,
-                )
-                if cache is not None
-                else None
-            )
-            if cached_result is not None:
-                result = cached_result
-                if streaming_callback is not None:
-                    await streaming_callback(i, result)
-            else:
-                call_started_at = time.perf_counter()
-                while retry_count < MAX_VALIDATION_ATTEMPTS:
-                    try:
-                        if streaming_callback is None:
-                            result = await base_func(client=client, messages=messages)
-                        else:
-                            result = await base_func(
-                                client=client,
-                                streaming_callback=_get_single_streaming_callback(
-                                    i, streaming_callback
-                                ),
-                                messages=messages,
-                            )
-
-                        # Validate if validation callback provided and result is successful
-                        if validation_callback and not result.did_error:
-                            await validation_callback(i, result)
-
-                        break
-                    except ValidationFailedException as e:
-                        retry_count += 1
-                        logger.warning(
-                            f"Validation failed for {model_name} after {retry_count} attempts: {e}"
+            call_started_at = time.perf_counter()
+            current_messages = messages
+            while retry_count < MAX_VALIDATION_ATTEMPTS:
+                try:
+                    if streaming_callback is None:
+                        result = await base_func(client=client, messages=current_messages)
+                    else:
+                        result = await base_func(
+                            client=client,
+                            streaming_callback=_get_single_streaming_callback(
+                                i, streaming_callback
+                            ),
+                            messages=current_messages,
                        )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+                    # Validate if validation callback provided and result is successful
+                    if validation_callback and not result.did_error:
+                        await validation_callback(i, result)
+
+                    break
+                except ValidationFailedException as e:
+                    retry_count += 1
+                    logger.warning(
+                        "Validation failed for %s after %d attempts: %s",
+                        model_name,
+                        retry_count,
+                        e,
+                        extra={"original_output": e.failed_output},
+                    )
+                    if retry_count >= MAX_VALIDATION_ATTEMPTS:
+                        logger.error(
+                            "Validation failed for %s after %d attempts: %s",
+                            model_name,
+                            retry_count,
+                            e,
+                            extra={"original_output": e.failed_output},
                        )
-                        cancelled_due_to_usage_limit = True
-                        tg.cancel_scope.cancel()
-                        break
-                    except asyncio.TimeoutError as e:
-                        timeout_exception = TimeoutException(str(e) or "Request timed out")
-                        timeout_exception.__cause__ = e
-                        logger.error(f"Call to {model_name} timed out")
                        result = LLMOutput(
                            model=model_name,
                            completions=[],
-                            errors=[
+                            errors=[e],
                        )
                        break
-                    except Exception as e:
-                        if not isinstance(e, LLMException):
-                            logger.error(
-                                f"LLM call raised an exception that is not an LLMException: {e}. Failure traceback:\n{traceback.format_exc()}"
-                            )
-                            llm_exception = LLMException(e)
-                            llm_exception.__cause__ = e
-                        else:
-                            llm_exception = e
-
-                        error_message = f"Call to {model_name} failed even with backoff: {e.__class__.__name__}."
-
-                        if not isinstance(e, RateLimitException):
-                            error_message += f" Failure traceback:\n{traceback.format_exc()}"
-                        logger.error(error_message)
 
-
-
-
-
+                    if retry_with_feedback:
+                        # Build a new message list with the failed output and
+                        # error feedback so the model can correct itself
+                        current_messages = [
+                            *messages,
+                            AssistantMessage(content=e.failed_output or ""),
+                            UserMessage(
+                                content=f"Your previous output failed validation: {e}\n\nPlease try again with a corrected output."
+                            ),
+                        ]
+                except DocentUsageLimitException as _:
+                    result = LLMOutput(
+                        model=model_name,
+                        completions=[],
+                        errors=[],  # Usage limit exceptions will be added to all results later if cancelled_due_to_usage_limit
+                    )
+                    cancelled_due_to_usage_limit = True
+                    tg.cancel_scope.cancel()
+                    break
+                except asyncio.TimeoutError as e:
+                    timeout_exception = TimeoutException(str(e) or "Request timed out")
+                    timeout_exception.__cause__ = e
+                    logger.error(f"Call to {model_name} timed out")
+                    result = LLMOutput(
+                        model=model_name,
+                        completions=[],
+                        errors=[timeout_exception],
+                    )
+                    break
+                except Exception as e:
+                    if not isinstance(e, LLMException):
+                        logger.error(
+                            f"LLM call raised an exception that is not an LLMException: {e}. Failure traceback:\n{traceback.format_exc()}"
                        )
-
+                        llm_exception = LLMException(e)
+                        llm_exception.__cause__ = e
+                    else:
+                        llm_exception = e
+
+                    error_message = (
+                        f"Call to {model_name} failed even with backoff: {e.__class__.__name__}."
+                    )
 
-
-
+                    if not isinstance(e, RateLimitException):
+                        error_message += f" Failure traceback:\n{traceback.format_exc()}"
+                    logger.error(error_message)
+
+                    result = LLMOutput(
+                        model=model_name,
+                        completions=[],
+                        errors=[llm_exception],
+                    )
+                    break
+
+            if result is not None:
                result.duration = time.perf_counter() - call_started_at
 
                # Always call completion callback with final result (success or error)
```
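The hunk above removes the per-call cache lookup and adds the feedback-retry path: when `validation_callback` rejects a completion and `retry_with_feedback` is set, the failed output is replayed as an `AssistantMessage` followed by a corrective `UserMessage` before the next attempt. A minimal self-contained sketch of that message rewriting, using stand-in dataclasses for the real `docent.data_models.chat.message` classes (their exact field sets are an assumption here):

```python
from dataclasses import dataclass


# Stand-ins for the real classes in docent.data_models.chat.message;
# assumed to carry a plain `content` string for illustration only.
@dataclass
class AssistantMessage:
    content: str


@dataclass
class UserMessage:
    content: str


def build_feedback_messages(
    messages: list, failed_output: str | None, error: str
) -> list:
    """Mirror the retry_with_feedback branch above: replay the failed
    completion, then ask the model to correct itself."""
    return [
        *messages,
        AssistantMessage(content=failed_output or ""),
        UserMessage(
            content=f"Your previous output failed validation: {error}\n\n"
            "Please try again with a corrected output."
        ),
    ]


# One failed round of a JSON-only task:
history = [UserMessage(content="Return a JSON object with a 'name' key.")]
retry = build_feedback_messages(history, "{'name': 'x'}", "not valid JSON")
assert len(retry) == 3  # original prompt + failed output + feedback
```

Because `current_messages` is rebuilt from the original `messages` each time, feedback from earlier failed attempts is not stacked; each retry sees only the most recent failure.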
```diff
@@ -244,44 +249,14 @@ async def _parallelize_calls(
                    await completion_callback(i, result)
                # LLMService uses this callback to record cost, and may throw an error if we just exceeded limit
                except DocentUsageLimitException as e:
-                    result.errors
+                    if not result.completions and not result.errors:
+                        result.errors.append(e)
                    cancelled_due_to_usage_limit = True
                    tg.cancel_scope.cancel()
 
            responses[i] = result
            if pbar is not None:
                pbar.update(1)
-            if pbar is None or pbar.n == pbar.total:
-                tg.cancel_scope.cancel()
-
-    def _cache_responses():
-        nonlocal responses, cache
-
-        if cache is not None:
-            indices = [
-                i
-                for i, response in enumerate(responses)
-                if resolved_messages[i] is not None
-                and response is not None
-                and not response.did_error
-            ]
-            cache.set_batch(
-                # We already checked that each index has a resolved messages list
-                [cast(list[ChatMessage], resolved_messages[i]) for i in indices],
-                model_name,
-                # We already checked that each index corresponds to an LLMOutput object
-                [cast(LLMOutput, responses[i]) for i in indices],
-                tools=tools,
-                tool_choice=tool_choice,
-                reasoning_effort=reasoning_effort,
-                temperature=temperature,
-                logprobs=logprobs,
-                top_logprobs=top_logprobs,
-                response_format=response_format,
-            )
-            return len(indices)
-        else:
-            return 0
 
     # Get all results concurrently
     try:
```
```diff
@@ -290,30 +265,14 @@ async def _parallelize_calls(
            for i, cur_input in enumerate(inputs):
                tg.start_soon(_limited_task, i, cur_input, tg)
 
-    # Cache what we have so far if something got cancelled
     except anyio.get_cancelled_exc_class():
-
-
-        logger.info(
-            f"Cancelled {len(inputs) - num_cached} unfinished LLM API calls, but cached {num_cached} completed responses"
-        )
-
-        # If the task was cancelled due to usage limit, set the response to a usage limit exception
-        if cancelled_due_to_usage_limit:
-            for i, response in enumerate(responses):
-                if response is None:
-                    responses[i] = LLMOutput(
-                        model=model_name,
-                        completions=[],
-                        errors=[DocentUsageLimitException()],
-                    )
-                else:
-                    response.errors.append(DocentUsageLimitException())
+        if not cancelled_due_to_usage_limit:
+            raise
 
-
-
-
-
+    # If we stopped the batch due to usage limits, make sure every input has a
+    # structured result instead of relying on AnyIO's cancellation propagation.
+    if cancelled_due_to_usage_limit:
+        _mark_usage_limit_responses()
 
     # At this point, all indices should have a result
     assert all(isinstance(r, LLMOutput) for r in responses), (
```
```diff
@@ -357,9 +316,9 @@ class BaseLLMService:
        streaming_callback: AsyncLLMOutputStreamingCallback | None = None,
        validation_callback: AsyncLLMOutputStreamingCallback | None = None,
        completion_callback: AsyncLLMOutputStreamingCallback | None = None,
-        use_cache: bool = False,
        response_format: ResponseFormat | None = None,
        max_retries: int = 1,
+        retry_with_feedback: bool = False,
        _api_key_overrides: dict[str, str] = dict(),
    ) -> list[LLMOutput]:
        """Request completions from a configured LLM provider."""
@@ -375,14 +334,6 @@ class BaseLLMService:
                f"Logprobs are not supported for Anthropic, so we can't use model {model_option.model_name}"
            )
 
-        # Instantiate cache
-        # TODO(mengk): make this more robust, possibly move to a NoSQL database or something
-        try:
-            cache = LLMCache() if use_cache else None
-        except ValueError as e:
-            logger.warning(f"Disabling LLM cache due to init error: {e}")
-            cache = None
-
        # Initialize pointer to which model we're using; used for model rotation after failures
        current_model_option_index = 0
 
@@ -413,7 +364,7 @@ class BaseLLMService:
            single_output_getter = PROVIDERS[provider]["single_output_getter"]
            single_streaming_output_getter = PROVIDERS[provider]["single_streaming_output_getter"]
 
-            # Get completions for
+            # Get completions for messages.
            outputs: list[LLMOutput] = await _parallelize_calls(
                (
                    single_output_getter
@@ -436,11 +387,18 @@ class BaseLLMService:
                timeout=timeout,
                semaphore=self._semaphore,
                max_retries=max_retries,
-                cache=cache,
                response_format=response_format,
+                retry_with_feedback=retry_with_feedback,
            )
            assert len(outputs) == len(inputs), "Number of outputs must match number of messages"
 
+            if any(
+                isinstance(e, DocentUsageLimitException)
+                for output in outputs
+                for e in output.errors
+            ):
+                break
+
            # Only count errors that should trigger model rotation (API errors, not validation/usage errors)
            num_rotation_errors = sum(
                1
```
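Taken together, the service-level changes swap the removed `use_cache` flag for `retry_with_feedback` and stop rotating models once any output in a batch carries a `DocentUsageLimitException`. A caller-side sketch under stated assumptions: the entry-point name `get_completions` and the exception's import path are not confirmed by this diff.

```python
# Hypothetical usage sketch; `get_completions` and the import path below are
# assumptions, not confirmed by this diff.
from docent._llm_util.data_models.exceptions import DocentUsageLimitException


async def run_batch(svc: "BaseLLMService", inputs: list, model_options: list):
    outputs = await svc.get_completions(  # assumed entry-point name
        inputs,
        model_options,
        retry_with_feedback=True,  # replaces the removed use_cache=... argument
        max_retries=1,
    )
    # Per the new early-break logic above, a usage-limit error anywhere in the
    # batch stops further model rotation; unfinished slots are filled with
    # structured usage-limit results by _mark_usage_limit_responses.
    hit_limit = any(
        isinstance(e, DocentUsageLimitException)
        for output in outputs
        for e in output.errors
    )
    return outputs, hit_limit
```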
{docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/_log_util/logger.py
RENAMED
```diff
@@ -1,4 +1,5 @@
 import logging
+import os
 import sys
 from dataclasses import dataclass
 from typing import IO, Any, Dict, Literal, MutableMapping, Optional, Tuple
@@ -135,8 +136,8 @@ def get_logger(namespace: str, stream: IO[str] | None = None) -> LoggerAdapter:
     handler.setFormatter(ColoredFormatter())
     logger.addHandler(handler)
 
-
-    logger.setLevel(logging.INFO)
+    level_name = os.environ.get("DOCENT_LOG_LEVEL", "INFO").upper()
+    logger.setLevel(getattr(logging, level_name, logging.INFO))
 
     # Wrap with adapter to support highlighting
     return LoggerAdapter(logger, {})
```
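The log level is no longer hard-coded to INFO: `get_logger` now reads `DOCENT_LOG_LEVEL` when the logger is created and falls back to INFO for unrecognized values. A small usage sketch:

```python
import os

# Set before the first get_logger() call for a namespace; the level is read
# once, at logger-creation time.
os.environ["DOCENT_LOG_LEVEL"] = "DEBUG"

from docent._log_util import get_logger

logger = get_logger("docent.example")
logger.debug("visible now; the old hard-coded INFO level would have hidden this")
# An unrecognized value such as "VERBOSE" falls back to logging.INFO
# via getattr(logging, level_name, logging.INFO).
```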
{docent_python-0.1.61a0 → docent_python-0.1.62a0}/docent/data_models/__init__.py
RENAMED
```diff
@@ -2,6 +2,7 @@ from docent.data_models.agent_run import AgentRun
 from docent.data_models.citation import InlineCitation
 from docent.data_models.judge import Label
 from docent.data_models.regex import RegexSnippet
+from docent.data_models.report import Report
 from docent.data_models.transcript import Transcript, TranscriptGroup
 
 __all__ = [
@@ -9,6 +10,7 @@ __all__ = [
     "InlineCitation",
     "Label",
     "RegexSnippet",
+    "Report",
     "Transcript",
     "TranscriptGroup",
 ]
```