inspect-ai 0.3.100__py3-none-any.whl → 0.3.101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. inspect_ai/_cli/eval.py +2 -1
  2. inspect_ai/_eval/eval.py +13 -1
  3. inspect_ai/_eval/evalset.py +3 -2
  4. inspect_ai/_eval/run.py +2 -0
  5. inspect_ai/_eval/task/log.py +3 -1
  6. inspect_ai/_view/www/dist/assets/index.css +44 -12
  7. inspect_ai/_view/www/dist/assets/index.js +1499 -1467
  8. inspect_ai/_view/www/package.json +4 -4
  9. inspect_ai/_view/www/src/app/log-view/tabs/grouping.ts +4 -4
  10. inspect_ai/_view/www/src/app/routing/navigationHooks.ts +22 -25
  11. inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +17 -5
  12. inspect_ai/_view/www/src/state/hooks.ts +1 -1
  13. inspect_ai/_view/www/yarn.lock +21 -27
  14. inspect_ai/analysis/beta/__init__.py +2 -0
  15. inspect_ai/dataset/_sources/csv.py +2 -6
  16. inspect_ai/dataset/_sources/hf.py +2 -6
  17. inspect_ai/dataset/_sources/json.py +2 -6
  18. inspect_ai/dataset/_util.py +23 -0
  19. inspect_ai/log/_recorders/eval.py +4 -3
  20. inspect_ai/log/_recorders/json.py +1 -0
  21. inspect_ai/log/_recorders/recorder.py +1 -0
  22. inspect_ai/model/_openai_responses.py +11 -6
  23. inspect_ai/model/_openai_web_search.py +9 -2
  24. inspect_ai/model/_providers/openai.py +3 -1
  25. inspect_ai/model/_providers/openai_responses.py +5 -1
  26. inspect_ai/scorer/_reducer/reducer.py +1 -1
  27. inspect_ai/tool/_tools/_web_search/_google.py +28 -11
  28. inspect_ai/tool/_tools/_web_search/_tavily.py +11 -1
  29. {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/METADATA +1 -1
  30. {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/RECORD +34 -34
  31. {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/WHEEL +0 -0
  32. {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/entry_points.txt +0 -0
  33. {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/licenses/LICENSE +0 -0
  34. {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -35,7 +35,7 @@ from .util import (
 )

 MAX_SAMPLES_HELP = "Maximum number of samples to run in parallel (default is running all samples in parallel)"
-MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
+MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1 for eval and 4 for eval-set)"
 MAX_SUBPROCESSES_HELP = (
     "Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
 )
@@ -949,6 +949,7 @@ def eval_exec(
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
+        log_header_only=True,  # cli invocation doesn't need full log
         score=score,
         score_display=score_display,
     )
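Taken together, these two hunks reflect a new split in defaults: `inspect eval` still runs one task at a time, `inspect eval-set` now runs up to four in parallel, and a plain CLI invocation requests header-only logs since it only reports summary results. A minimal sketch of overriding the task parallelism from the Python API (the task files and model are hypothetical):

    from inspect_ai import eval

    # explicit max_tasks overrides the default of 1 described by MAX_TASKS_HELP
    logs = eval(["arc.py", "gsm8k.py"], model="openai/gpt-4o", max_tasks=2)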
inspect_ai/_eval/eval.py CHANGED
@@ -105,6 +105,7 @@ def eval(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
+    log_header_only: bool | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -181,6 +182,8 @@ def eval(
         log_shared: Sync sample events to log directory so that users on other systems
             can see log updates in realtime (defaults to no syncing). Specify `True`
             to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
+        log_header_only: If `True`, the function should return only log headers rather
+            than full logs with samples (defaults to `False`).
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -234,6 +237,7 @@ def eval(
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
+        log_header_only=log_header_only,
         score=score,
         score_display=score_display,
         **kwargs,
@@ -288,6 +292,7 @@ async def eval_async(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
+    log_header_only: bool | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -344,7 +349,9 @@ async def eval_async(
         log_buffer: Number of samples to buffer before writing log file.
             If not specified, an appropriate default for the format and filesystem is
             chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
-        log_shared: Indicate that the log directory is shared, which results in additional syncing of realtime log data for Inspect View.
+        log_shared: Indicate that the log directory is shared, which results in additional
+            syncing of realtime log data for Inspect View.
+        log_header_only: If `True`, the function should return only log headers rather than full logs with samples (defaults to `False`).
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -432,6 +439,9 @@ async def eval_async(
     # resolve log_shared
     log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared

+    # resolve header only
+    log_header_only = log_header_only is True
+
     # validate that --log-shared can't use used with 'json' format
     if log_shared and log_format == JSON_LOG_FORMAT:
         raise PrerequisiteError(
@@ -507,6 +517,7 @@ async def eval_async(
         eval_config=eval_config,
         eval_sandbox=sandbox,
         recorder=recorder,
+        header_only=log_header_only,
         epochs_reducer=epochs_reducer,
         solver=solver,
         tags=tags,
@@ -532,6 +543,7 @@ async def eval_async(
         eval_config=eval_config,
         eval_sandbox=sandbox,
         recorder=recorder,
+        header_only=log_header_only,
         epochs_reducer=epochs_reducer,
         solver=solver,
         tags=tags,
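For the Python API, `log_header_only` is strictly opt-in: it is coerced to `False` unless passed as `True`. A minimal sketch of requesting header-only results, assuming a hypothetical task file and model, and that header-only `EvalLog` objects simply omit sample data:

    from inspect_ai import eval

    # returns lightweight log headers; the full logs are still written to disk
    logs = eval("arc.py", model="openai/gpt-4o", log_header_only=True)
    print(logs[0].status, logs[0].results)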
inspect_ai/_eval/evalset.py CHANGED
@@ -168,7 +168,7 @@ def eval_set(
         max_samples: Maximum number of samples to run in parallel
             (default is max_connections)
         max_tasks: Maximum number of tasks to run in parallel
-            (defaults to number of models being evaluated)
+            (defaults to the greater of 4 and the number of models being evaluated)
         max_subprocesses: Maximum number of subprocesses to
             run in parallel (default is os.cpu_count())
         max_sandboxes: Maximum number of sandboxes (per-provider)
@@ -235,6 +235,7 @@ def eval_set(
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
+        log_header_only=True,
         score=score,
         **kwargs,
     )
@@ -277,7 +278,7 @@ def eval_set(
     retry_connections = retry_connections or 0.5
     retry_cleanup = retry_cleanup is not False
     max_connections = starting_max_connections(models, GenerateConfig(**kwargs))
-    max_tasks = max_tasks if max_tasks is not None else len(models)
+    max_tasks = max_tasks if max_tasks is not None else max(len(models), 4)

     # prepare console/status
     console = rich.get_console()
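`eval_set()` now always asks `eval()` for header-only logs and defaults `max_tasks` to the greater of 4 and the number of models. A sketch with hypothetical task files, assuming the usual `(success, logs)` return value; with a single model this runs up to `max(1, 4) == 4` tasks in parallel:

    from inspect_ai import eval_set

    success, logs = eval_set(
        ["arc.py", "gsm8k.py"],
        log_dir="logs/eval-set",
        model="openai/gpt-4o",
    )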
inspect_ai/_eval/run.py CHANGED
@@ -63,6 +63,7 @@ async def eval_run(
     eval_config: EvalConfig,
     eval_sandbox: SandboxEnvironmentType | None,
     recorder: Recorder,
+    header_only: bool,
     epochs_reducer: list[ScoreReducer] | None = None,
     solver: Solver | SolverSpec | None = None,
     tags: list[str] | None = None,
@@ -212,6 +213,7 @@ async def eval_run(
         eval_config=task_eval_config,
         metadata=((metadata or {}) | (task.metadata or {})) or None,
         recorder=recorder,
+        header_only=header_only,
     )
     await logger.init()

inspect_ai/_eval/task/log.py CHANGED
@@ -75,6 +75,7 @@ class TaskLogger:
         eval_config: EvalConfig,
         metadata: dict[str, Any] | None,
         recorder: Recorder,
+        header_only: bool,
     ) -> None:
         # determine versions
         git = git_context()
@@ -153,6 +154,7 @@ class TaskLogger:
         # stack recorder and location
         self.recorder = recorder
+        self.header_only = header_only

         # number of samples logged
         self._samples_completed = 0
@@ -238,7 +240,7 @@ class TaskLogger:
     ) -> EvalLog:
         # finish and get log
         log = await self.recorder.log_finish(
-            self.eval, status, stats, results, reductions, error
+            self.eval, status, stats, results, reductions, error, self.header_only
         )

         # cleanup the events db
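`TaskLogger` forwards `header_only` to `Recorder.log_finish()`, so the recorder can hand back a header-only `EvalLog` while still writing the complete log file. A hedged sketch of loading either form afterwards (the log path is hypothetical):

    from inspect_ai.log import read_eval_log

    header = read_eval_log("logs/example.eval", header_only=True)  # metadata only
    full = read_eval_log("logs/example.eval")  # includes samples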