inspect-ai 0.3.100__py3-none-any.whl → 0.3.101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +2 -1
- inspect_ai/_eval/eval.py +13 -1
- inspect_ai/_eval/evalset.py +3 -2
- inspect_ai/_eval/run.py +2 -0
- inspect_ai/_eval/task/log.py +3 -1
- inspect_ai/_view/www/dist/assets/index.css +44 -12
- inspect_ai/_view/www/dist/assets/index.js +1499 -1467
- inspect_ai/_view/www/package.json +4 -4
- inspect_ai/_view/www/src/app/log-view/tabs/grouping.ts +4 -4
- inspect_ai/_view/www/src/app/routing/navigationHooks.ts +22 -25
- inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +17 -5
- inspect_ai/_view/www/src/state/hooks.ts +1 -1
- inspect_ai/_view/www/yarn.lock +21 -27
- inspect_ai/analysis/beta/__init__.py +2 -0
- inspect_ai/dataset/_sources/csv.py +2 -6
- inspect_ai/dataset/_sources/hf.py +2 -6
- inspect_ai/dataset/_sources/json.py +2 -6
- inspect_ai/dataset/_util.py +23 -0
- inspect_ai/log/_recorders/eval.py +4 -3
- inspect_ai/log/_recorders/json.py +1 -0
- inspect_ai/log/_recorders/recorder.py +1 -0
- inspect_ai/model/_openai_responses.py +11 -6
- inspect_ai/model/_openai_web_search.py +9 -2
- inspect_ai/model/_providers/openai.py +3 -1
- inspect_ai/model/_providers/openai_responses.py +5 -1
- inspect_ai/scorer/_reducer/reducer.py +1 -1
- inspect_ai/tool/_tools/_web_search/_google.py +28 -11
- inspect_ai/tool/_tools/_web_search/_tavily.py +11 -1
- {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/RECORD +34 -34
- {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -35,7 +35,7 @@ from .util import (
  )

  MAX_SAMPLES_HELP = "Maximum number of samples to run in parallel (default is running all samples in parallel)"
- MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
+ MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1 for eval and 4 for eval-set)"
  MAX_SUBPROCESSES_HELP = (
  "Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
  )
@@ -949,6 +949,7 @@ def eval_exec(
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
+ log_header_only=True, # cli invocation doesn't need full log
  score=score,
  score_display=score_display,
  )
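Because the CLI always writes complete logs to the log directory, `eval_exec` can request header-only results without losing anything; sample-level detail stays available by reading the written log file. A minimal sketch, assuming the existing `read_eval_log` API and a hypothetical log file path:

    from inspect_ai.log import read_eval_log

    # hypothetical log file produced by an `inspect eval` run
    log_file = "logs/2025-01-01T12-00-00_my-task_abc123.eval"

    # header only: status, results, stats -- no samples loaded
    header = read_eval_log(log_file, header_only=True)
    print(header.status, header.results)

    # full read when per-sample detail is needed
    full = read_eval_log(log_file)
    print(len(full.samples or []))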
inspect_ai/_eval/eval.py
CHANGED
@@ -105,6 +105,7 @@ def eval(
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
+ log_header_only: bool | None = None,
  score: bool = True,
  score_display: bool | None = None,
  **kwargs: Unpack[GenerateConfigArgs],
@@ -181,6 +182,8 @@ def eval(
  log_shared: Sync sample events to log directory so that users on other systems
  can see log updates in realtime (defaults to no syncing). Specify `True`
  to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
+ log_header_only: If `True`, the function should return only log headers rather
+ than full logs with samples (defaults to `False`).
  score: Score output (defaults to True)
  score_display: Show scoring metrics in realtime (defaults to True)
  **kwargs: Model generation options.
@@ -234,6 +237,7 @@ def eval(
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
+ log_header_only=log_header_only,
  score=score,
  score_display=score_display,
  **kwargs,
@@ -288,6 +292,7 @@ async def eval_async(
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
+ log_header_only: bool | None = None,
  score: bool = True,
  score_display: bool | None = None,
  **kwargs: Unpack[GenerateConfigArgs],
@@ -344,7 +349,9 @@ async def eval_async(
  log_buffer: Number of samples to buffer before writing log file.
  If not specified, an appropriate default for the format and filesystem is
  chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
- log_shared: Indicate that the log directory is shared, which results in additional
+ log_shared: Indicate that the log directory is shared, which results in additional
+ syncing of realtime log data for Inspect View.
+ log_header_only: If `True`, the function should return only log headers rather than full logs with samples (defaults to `False`).
  score: Score output (defaults to True)
  score_display: Show scoring metrics in realtime (defaults to True)
  **kwargs: Model generation options.
@@ -432,6 +439,9 @@ async def eval_async(
  # resolve log_shared
  log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared

+ # resolve header only
+ log_header_only = log_header_only is True
+
  # validate that --log-shared can't use used with 'json' format
  if log_shared and log_format == JSON_LOG_FORMAT:
  raise PrerequisiteError(
@@ -507,6 +517,7 @@
  eval_config=eval_config,
  eval_sandbox=sandbox,
  recorder=recorder,
+ header_only=log_header_only,
  epochs_reducer=epochs_reducer,
  solver=solver,
  tags=tags,
@@ -532,6 +543,7 @@
  eval_config=eval_config,
  eval_sandbox=sandbox,
  recorder=recorder,
+ header_only=log_header_only,
  epochs_reducer=epochs_reducer,
  solver=solver,
  tags=tags,
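For callers of the Python API, `log_header_only` is opt-in. A minimal sketch of the new parameter, assuming a hypothetical registered task and model (per the docstring above, the returned logs then carry headers only, not sample bodies):

    from inspect_ai import eval

    logs = eval(
        "theory_of_mind",        # hypothetical registered task name
        model="openai/gpt-4o",   # hypothetical model
        log_header_only=True,
    )

    # header-level fields are populated on the returned logs
    log = logs[0]
    print(log.status, log.results)

    # per the docstring above, sample bodies are not part of the return;
    # read them back from the log file on disk if needed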
inspect_ai/_eval/evalset.py
CHANGED
@@ -168,7 +168,7 @@ def eval_set(
  max_samples: Maximum number of samples to run in parallel
  (default is max_connections)
  max_tasks: Maximum number of tasks to run in parallel
- (defaults to number of models being evaluated)
+ (defaults to the greater of 4 and the number of models being evaluated)
  max_subprocesses: Maximum number of subprocesses to
  run in parallel (default is os.cpu_count())
  max_sandboxes: Maximum number of sandboxes (per-provider)
@@ -235,6 +235,7 @@ def eval_set(
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
+ log_header_only=True,
  score=score,
  **kwargs,
  )
@@ -277,7 +278,7 @@ def eval_set(
  retry_connections = retry_connections or 0.5
  retry_cleanup = retry_cleanup is not False
  max_connections = starting_max_connections(models, GenerateConfig(**kwargs))
- max_tasks = max_tasks if max_tasks is not None else len(models)
+ max_tasks = max_tasks if max_tasks is not None else max(len(models), 4)

  # prepare console/status
  console = rich.get_console()
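The net effect is that `eval_set()` now runs up to four tasks in parallel even for a single model, while still honoring an explicit `max_tasks`. A small sketch of the default resolution changed above (the `resolve_max_tasks` helper is purely illustrative):

    # mirrors the changed default: max(len(models), 4) when max_tasks is unset
    def resolve_max_tasks(max_tasks: int | None, models: list[str]) -> int:
        return max_tasks if max_tasks is not None else max(len(models), 4)

    assert resolve_max_tasks(None, ["openai/gpt-4o-mini"]) == 4   # previously 1 model -> 1 task
    assert resolve_max_tasks(None, ["m1", "m2", "m3", "m4", "m5"]) == 5
    assert resolve_max_tasks(2, ["m1", "m2", "m3"]) == 2          # explicit value wins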
inspect_ai/_eval/run.py
CHANGED
@@ -63,6 +63,7 @@ async def eval_run(
  eval_config: EvalConfig,
  eval_sandbox: SandboxEnvironmentType | None,
  recorder: Recorder,
+ header_only: bool,
  epochs_reducer: list[ScoreReducer] | None = None,
  solver: Solver | SolverSpec | None = None,
  tags: list[str] | None = None,
@@ -212,6 +213,7 @@ async def eval_run(
  eval_config=task_eval_config,
  metadata=((metadata or {}) | (task.metadata or {})) or None,
  recorder=recorder,
+ header_only=header_only,
  )
  await logger.init()

inspect_ai/_eval/task/log.py
CHANGED
@@ -75,6 +75,7 @@ class TaskLogger:
  eval_config: EvalConfig,
  metadata: dict[str, Any] | None,
  recorder: Recorder,
+ header_only: bool,
  ) -> None:
  # determine versions
  git = git_context()
@@ -153,6 +154,7 @@ class TaskLogger:

  # stack recorder and location
  self.recorder = recorder
+ self.header_only = header_only

  # number of samples logged
  self._samples_completed = 0
@@ -238,7 +240,7 @@ class TaskLogger:
  ) -> EvalLog:
  # finish and get log
  log = await self.recorder.log_finish(
- self.eval, status, stats, results, reductions, error
+ self.eval, status, stats, results, reductions, error, self.header_only
  )

  # cleanup the events db
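Together with the `run.py` change above, the flag flows eval(log_header_only=...) → eval_run(header_only=...) → TaskLogger(header_only=...) → recorder.log_finish(..., header_only). A toy sketch of what the recorder-side behavior amounts to (this is not the actual inspect_ai `Recorder`; names other than `header_only` are illustrative):

    from dataclasses import dataclass
    from typing import Any

    @dataclass
    class ToyLog:
        status: str
        stats: dict[str, Any]
        samples: list[Any] | None = None

    class ToyRecorder:
        def __init__(self) -> None:
            self._samples: list[Any] = []

        def log_sample(self, sample: Any) -> None:
            self._samples.append(sample)

        def log_finish(self, status: str, stats: dict[str, Any], header_only: bool) -> ToyLog:
            # when header_only is passed (as TaskLogger now does with self.header_only),
            # the returned log omits sample bodies (in inspect_ai they remain in the
            # written log file rather than in the in-memory result)
            samples = None if header_only else list(self._samples)
            return ToyLog(status=status, stats=stats, samples=samples)

    recorder = ToyRecorder()
    recorder.log_sample({"id": 1})
    print(recorder.log_finish("success", {"total": 1}, header_only=True).samples)   # None
    print(recorder.log_finish("success", {"total": 1}, header_only=False).samples)  # [{'id': 1}]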