inspect-ai 0.3.104__py3-none-any.whl → 0.3.106__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. inspect_ai/_eval/context.py +5 -0
  2. inspect_ai/_eval/eval.py +113 -1
  3. inspect_ai/_eval/evalset.py +1 -1
  4. inspect_ai/_eval/task/run.py +64 -38
  5. inspect_ai/_util/eval_task_group.py +15 -0
  6. inspect_ai/_view/server.py +17 -0
  7. inspect_ai/_view/www/dist/assets/index.css +33 -29
  8. inspect_ai/_view/www/dist/assets/index.js +559 -247
  9. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +4 -0
  10. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -0
  11. inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +26 -0
  12. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +14 -3
  13. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +359 -7
  14. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/language.ts +6 -0
  15. inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +1 -1
  16. inspect_ai/_view/www/src/client/api/api-browser.ts +25 -0
  17. inspect_ai/_view/www/src/client/api/api-http.ts +3 -0
  18. inspect_ai/_view/www/src/client/api/api-vscode.ts +6 -0
  19. inspect_ai/_view/www/src/client/api/client-api.ts +3 -0
  20. inspect_ai/_view/www/src/client/api/jsonrpc.ts +1 -0
  21. inspect_ai/_view/www/src/client/api/types.ts +3 -0
  22. inspect_ai/_view/www/src/state/samplePolling.ts +17 -1
  23. inspect_ai/agent/_handoff.py +5 -2
  24. inspect_ai/agent/_react.py +43 -20
  25. inspect_ai/dataset/_dataset.py +1 -1
  26. inspect_ai/log/_samples.py +5 -0
  27. inspect_ai/model/_call_tools.py +4 -4
  28. inspect_ai/model/_providers/_openai_web_search.py +1 -1
  29. inspect_ai/model/_providers/anthropic.py +23 -2
  30. inspect_ai/model/_providers/google.py +5 -1
  31. inspect_ai/model/_providers/groq.py +5 -0
  32. inspect_ai/model/_providers/perplexity.py +27 -1
  33. inspect_ai/model/_providers/providers.py +1 -1
  34. inspect_ai/tool/_tools/_web_search/_web_search.py +8 -3
  35. inspect_ai/util/__init__.py +8 -0
  36. inspect_ai/util/_background.py +64 -0
  37. inspect_ai/util/_limit.py +72 -5
  38. inspect_ai/util/_sandbox/__init__.py +2 -0
  39. inspect_ai/util/_sandbox/service.py +28 -7
  40. inspect_ai/util/_subprocess.py +51 -38
  41. {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.106.dist-info}/METADATA +1 -1
  42. {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.106.dist-info}/RECORD +46 -44
  43. {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.106.dist-info}/WHEEL +0 -0
  44. {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.106.dist-info}/entry_points.txt +0 -0
  45. {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.106.dist-info}/licenses/LICENSE +0 -0
  46. {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.106.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/context.py CHANGED
@@ -1,4 +1,7 @@
1
+ from anyio.abc import TaskGroup
2
+
1
3
  from inspect_ai._util.dotenv import init_dotenv
4
+ from inspect_ai._util.eval_task_group import init_eval_task_group
2
5
  from inspect_ai._util.hooks import init_hooks
3
6
  from inspect_ai._util.logger import init_logger
4
7
  from inspect_ai.approval._apply import have_tool_approval, init_tool_approval
@@ -19,6 +22,7 @@ def init_eval_context(
19
22
  log_level: str | None,
20
23
  log_level_transcript: str | None,
21
24
  max_subprocesses: int | None = None,
25
+ task_group: TaskGroup | None = None,
22
26
  ) -> None:
23
27
  init_dotenv()
24
28
  init_logger(log_level, log_level_transcript)
@@ -27,6 +31,7 @@ def init_eval_context(
27
31
  init_hooks()
28
32
  init_active_samples()
29
33
  init_human_approval_manager()
34
+ init_eval_task_group(task_group)
30
35
 
31
36
 
32
37
  def init_task_context(
inspect_ai/_eval/eval.py CHANGED
@@ -4,11 +4,15 @@ import sys
4
4
  from pathlib import Path
5
5
  from typing import Any, Literal, cast
6
6
 
7
+ import anyio
8
+ from anyio.abc import TaskGroup
9
+
7
10
  from inspect_ai._eval.task.task import resolve_model_roles
8
11
  from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
9
12
  from inspect_ai.agent._agent import Agent, is_agent
10
13
  from inspect_ai.agent._as_solver import as_solver
11
14
  from inspect_ai.log._model import model_roles_config_to_model_roles
15
+ from inspect_ai.util._anyio import inner_exception
12
16
 
13
17
  if sys.version_info < (3, 11):
14
18
  from exceptiongroup import ExceptionGroup
@@ -359,6 +363,112 @@ async def eval_async(
359
363
  Returns:
360
364
  List of EvalLog (one for each task)
361
365
  """
366
+ result: list[EvalLog] | None = None
367
+
368
+ async def run(tg: TaskGroup) -> None:
369
+ try:
370
+ nonlocal result
371
+ result = await _eval_async_inner(
372
+ tg=tg,
373
+ tasks=tasks,
374
+ model=model,
375
+ model_base_url=model_base_url,
376
+ model_args=model_args,
377
+ model_roles=model_roles,
378
+ task_args=task_args,
379
+ sandbox=sandbox,
380
+ sandbox_cleanup=sandbox_cleanup,
381
+ solver=solver,
382
+ tags=tags,
383
+ metadata=metadata,
384
+ approval=approval,
385
+ log_level=log_level,
386
+ log_level_transcript=log_level_transcript,
387
+ log_dir=log_dir,
388
+ log_format=log_format,
389
+ limit=limit,
390
+ sample_id=sample_id,
391
+ epochs=epochs,
392
+ fail_on_error=fail_on_error,
393
+ retry_on_error=retry_on_error,
394
+ debug_errors=debug_errors,
395
+ message_limit=message_limit,
396
+ token_limit=token_limit,
397
+ time_limit=time_limit,
398
+ working_limit=working_limit,
399
+ max_samples=max_samples,
400
+ max_tasks=max_tasks,
401
+ max_subprocesses=max_subprocesses,
402
+ max_sandboxes=max_sandboxes,
403
+ log_samples=log_samples,
404
+ log_realtime=log_realtime,
405
+ log_images=log_images,
406
+ log_buffer=log_buffer,
407
+ log_shared=log_shared,
408
+ log_header_only=log_header_only,
409
+ score=score,
410
+ score_display=score_display,
411
+ **kwargs,
412
+ )
413
+ finally:
414
+ tg.cancel_scope.cancel()
415
+
416
+ try:
417
+ async with anyio.create_task_group() as tg:
418
+ tg.start_soon(run, tg)
419
+ except Exception as ex:
420
+ raise inner_exception(ex)
421
+ except anyio.get_cancelled_exc_class():
422
+ # Cancelled exceptions are expected and handled by _eval_async_inner
423
+ pass
424
+
425
+ assert result is not None, "Eval async did not return a result."
426
+
427
+ return result
428
+
429
+
430
+ async def _eval_async_inner(
431
+ tg: TaskGroup,
432
+ tasks: Tasks,
433
+ model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
434
+ model_base_url: str | None = None,
435
+ model_args: dict[str, Any] | str = dict(),
436
+ model_roles: dict[str, str | Model] | None = None,
437
+ task_args: dict[str, Any] | str = dict(),
438
+ sandbox: SandboxEnvironmentType | None = None,
439
+ sandbox_cleanup: bool | None = None,
440
+ solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
441
+ tags: list[str] | None = None,
442
+ metadata: dict[str, Any] | None = None,
443
+ approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
444
+ log_level: str | None = None,
445
+ log_level_transcript: str | None = None,
446
+ log_dir: str | None = None,
447
+ log_format: Literal["eval", "json"] | None = None,
448
+ limit: int | tuple[int, int] | None = None,
449
+ sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
450
+ epochs: int | Epochs | None = None,
451
+ fail_on_error: bool | float | None = None,
452
+ retry_on_error: int | None = None,
453
+ debug_errors: bool | None = None,
454
+ message_limit: int | None = None,
455
+ token_limit: int | None = None,
456
+ time_limit: int | None = None,
457
+ working_limit: int | None = None,
458
+ max_samples: int | None = None,
459
+ max_tasks: int | None = None,
460
+ max_subprocesses: int | None = None,
461
+ max_sandboxes: int | None = None,
462
+ log_samples: bool | None = None,
463
+ log_realtime: bool | None = None,
464
+ log_images: bool | None = None,
465
+ log_buffer: int | None = None,
466
+ log_shared: bool | int | None = None,
467
+ log_header_only: bool | None = None,
468
+ score: bool = True,
469
+ score_display: bool | None = None,
470
+ **kwargs: Unpack[GenerateConfigArgs],
471
+ ) -> list[EvalLog]:
362
472
  # only a single call to eval_async can be active at a time, this used
363
473
  # to be due to running tasks switching to the task's directory, however
364
474
  # that feature no longer exists so we may be able to revisit this
@@ -387,6 +497,7 @@ async def eval_async(
387
497
  max_subprocesses=max_subprocesses,
388
498
  log_level=log_level,
389
499
  log_level_transcript=log_level_transcript,
500
+ task_group=tg,
390
501
  **kwargs,
391
502
  )
392
503
 
@@ -934,10 +1045,11 @@ def eval_init(
934
1045
  max_subprocesses: int | None = None,
935
1046
  log_level: str | None = None,
936
1047
  log_level_transcript: str | None = None,
1048
+ task_group: TaskGroup | None = None,
937
1049
  **kwargs: Unpack[GenerateConfigArgs],
938
1050
  ) -> list[Model]:
939
1051
  # init eval context
940
- init_eval_context(log_level, log_level_transcript, max_subprocesses)
1052
+ init_eval_context(log_level, log_level_transcript, max_subprocesses, task_group)
941
1053
 
942
1054
  # resolve model and task args
943
1055
  model_args = resolve_args(model_args)
inspect_ai/_eval/evalset.py CHANGED
@@ -578,7 +578,7 @@ def task_identifier(task: ResolvedTask | EvalLog) -> str:
578
578
  else:
579
579
  task_file = task.eval.task_file or ""
580
580
  task_name = task.eval.task
581
- task_args = task.eval.task_args
581
+ task_args = task.eval.task_args_passed
582
582
  model = str(task.eval.model)
583
583
  model_roles = task.eval.model_roles or {}
584
584
 
inspect_ai/_eval/task/run.py CHANGED
@@ -10,6 +10,7 @@ from pathlib import PurePath
10
10
  from typing import Callable, Literal
11
11
 
12
12
  import anyio
13
+ from anyio.abc import TaskGroup
13
14
  from typing_extensions import Unpack
14
15
 
15
16
  from inspect_ai._display import (
@@ -306,37 +307,57 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
306
307
  task.metrics,
307
308
  )
308
309
 
310
+ async def run_sample(
311
+ sample: Sample, state: TaskState
312
+ ) -> dict[str, SampleScore] | None:
313
+ result: dict[str, SampleScore] | None = None
314
+
315
+ async def run(tg: TaskGroup) -> None:
316
+ try:
317
+ nonlocal result
318
+ result = await task_run_sample(
319
+ tg=tg,
320
+ task_name=task.name,
321
+ log_location=profile.log_location,
322
+ sample=sample,
323
+ state=state,
324
+ sandbox=sandbox,
325
+ max_sandboxes=config.max_sandboxes,
326
+ sandbox_cleanup=sandbox_cleanup,
327
+ plan=plan,
328
+ scorers=scorers,
329
+ generate=generate,
330
+ progress=progress,
331
+ logger=logger if log_samples else None,
332
+ log_images=log_images,
333
+ sample_source=sample_source,
334
+ sample_error=sample_error_handler,
335
+ sample_complete=sample_complete,
336
+ fails_on_error=(
337
+ config.fail_on_error is None
338
+ or config.fail_on_error is True
339
+ ),
340
+ retry_on_error=config.retry_on_error or 0,
341
+ error_retries=[],
342
+ time_limit=config.time_limit,
343
+ working_limit=config.working_limit,
344
+ semaphore=sample_semaphore,
345
+ )
346
+ finally:
347
+ tg.cancel_scope.cancel()
348
+
349
+ async with anyio.create_task_group() as tg:
350
+ tg.start_soon(run, tg)
351
+
352
+ return result
353
+
309
354
  sample_results = await tg_collect(
310
355
  [
311
- functools.partial(
312
- task_run_sample,
313
- task_name=task.name,
314
- log_location=profile.log_location,
315
- sample=sample,
316
- state=state,
317
- sandbox=sandbox,
318
- max_sandboxes=config.max_sandboxes,
319
- sandbox_cleanup=sandbox_cleanup,
320
- plan=plan,
321
- scorers=scorers,
322
- generate=generate,
323
- progress=progress,
324
- logger=logger if log_samples else None,
325
- log_images=log_images,
326
- sample_source=sample_source,
327
- sample_error=sample_error_handler,
328
- sample_complete=sample_complete,
329
- fails_on_error=(
330
- config.fail_on_error is None
331
- or config.fail_on_error is True
332
- ),
333
- retry_on_error=config.retry_on_error or 0,
334
- error_retries=[],
335
- time_limit=config.time_limit,
336
- working_limit=config.working_limit,
337
- semaphore=sample_semaphore,
356
+ functools.partial(run_sample, sample, state)
357
+ for (sample, state) in zip(
358
+ samples,
359
+ states,
338
360
  )
339
- for (sample, state) in zip(samples, states)
340
361
  ]
341
362
  )
342
363
 
@@ -492,6 +513,7 @@ def update_metrics_display_fn(
492
513
 
493
514
  async def task_run_sample(
494
515
  *,
516
+ tg: TaskGroup,
495
517
  task_name: str,
496
518
  log_location: str,
497
519
  sample: Sample,
@@ -611,12 +633,14 @@ async def task_run_sample(
611
633
  working_limit=working_limit,
612
634
  fails_on_error=fails_on_error or (retry_on_error > 0),
613
635
  transcript=sample_transcript,
636
+ tg=tg,
614
637
  ) as active,
615
638
  ):
616
639
  start_time: float | None = None
617
640
  error: EvalError | None = None
618
641
  raise_error: BaseException | None = None
619
642
  results: dict[str, SampleScore] = {}
643
+ limit: EvalSampleLimit | None = None
620
644
  try:
621
645
  # begin init
622
646
  init_span = span("init", type="init")
@@ -704,9 +728,17 @@ async def task_run_sample(
704
728
  # handle the cancel exception
705
729
  raise
706
730
 
707
- except (LimitExceededError, TerminateSampleError):
731
+ except LimitExceededError as ex:
732
+ # capture most recent state for scoring
733
+ state = sample_state() or state
734
+ limit = EvalSampleLimit(
735
+ type=ex.type, limit=ex.limit if ex.limit is not None else -1
736
+ )
737
+
738
+ except TerminateSampleError:
708
739
  # capture most recent state for scoring
709
740
  state = sample_state() or state
741
+ limit = EvalSampleLimit(type="operator", limit=1)
710
742
 
711
743
  except BaseException as ex:
712
744
  error, raise_error = handle_error(ex)
@@ -815,6 +847,7 @@ async def task_run_sample(
815
847
  state=state,
816
848
  scores=results,
817
849
  error=error,
850
+ limit=limit,
818
851
  error_retries=error_retries,
819
852
  log_images=log_images,
820
853
  )
@@ -854,6 +887,7 @@ async def task_run_sample(
854
887
  time_limit=time_limit,
855
888
  working_limit=working_limit,
856
889
  semaphore=semaphore,
890
+ tg=tg,
857
891
  )
858
892
 
859
893
  # no error
@@ -879,6 +913,7 @@ async def log_sample(
879
913
  state: TaskState,
880
914
  scores: dict[str, SampleScore],
881
915
  error: EvalError | None,
916
+ limit: EvalSampleLimit | None,
882
917
  error_retries: list[EvalError],
883
918
  log_images: bool,
884
919
  ) -> None:
@@ -894,15 +929,6 @@ async def log_sample(
894
929
  # compute total time if we can
895
930
  total_time = time.monotonic() - start_time if start_time is not None else None
896
931
 
897
- # if a limit was hit, note that in the Eval Sample
898
- limit = None
899
- for e in transcript().events:
900
- if e.event == "sample_limit":
901
- limit = EvalSampleLimit(
902
- type=e.type, limit=e.limit if e.limit is not None else -1
903
- )
904
- break
905
-
906
932
  eval_sample = EvalSample(
907
933
  id=id,
908
934
  epoch=state.epoch,
inspect_ai/_util/eval_task_group.py ADDED
@@ -0,0 +1,15 @@
1
+ from anyio.abc import TaskGroup
2
+
3
+ _eval_task_group: TaskGroup | None = None
4
+
5
+
6
+ def init_eval_task_group(tg: TaskGroup | None) -> None:
7
+ global _eval_task_group
8
+ _eval_task_group = tg
9
+
10
+
11
+ def eval_task_group() -> TaskGroup:
12
+ global _eval_task_group
13
+ if _eval_task_group is None:
14
+ raise RuntimeError("Task group has not been initialized")
15
+ return _eval_task_group
inspect_ai/_view/server.py CHANGED
@@ -155,6 +155,23 @@ def view_server(
155
155
  body=samples.model_dump_json(), headers={"ETag": samples.etag}
156
156
  )
157
157
 
158
+ @routes.get("/api/log-message")
159
+ async def api_log_message(request: web.Request) -> web.Response:
160
+ # log file requested
161
+ file = query_param_required("log_file", request, str)
162
+
163
+ file = urllib.parse.unquote(file)
164
+ validate_log_file_request(file)
165
+
166
+ # message to log
167
+ message = query_param_required("message", request, str)
168
+
169
+ # log the message
170
+ logger.warning(f"[CLIENT MESSAGE] ({file}): {message}")
171
+
172
+ # respond
173
+ return web.Response(status=204)
174
+
158
175
  @routes.get("/api/pending-sample-data")
159
176
  async def api_sample_events(request: web.Request) -> web.Response:
160
177
  # log file requested
inspect_ai/_view/www/dist/assets/index.css CHANGED
@@ -15655,18 +15655,40 @@ pre[class*="language-"] {
15655
15655
  background-color: var(--bs-light-border-subtle);
15656
15656
  margin-top: -1px;
15657
15657
  }
15658
- ._message_17kai_1 {
15658
+ ._keyPairContainer_1ltuo_1 {
15659
+ display: grid;
15660
+ grid-template-columns: max-content auto;
15661
+ column-gap: 0.5em;
15662
+ padding-top: 4px;
15663
+ padding-bottom: 4px;
15664
+ border-bottom: solid 1px var(--bs-border-color);
15665
+ }
15666
+
15667
+ ._key_1ltuo_1 {
15668
+ display: grid;
15669
+ grid-template-columns: 1em auto;
15670
+ cursor: pointer;
15671
+ }
15672
+
15673
+ ._pre_1ltuo_16 {
15674
+ margin-bottom: 0;
15675
+ }
15676
+
15677
+ ._treeIcon_1ltuo_20 {
15678
+ margin-top: -3px;
15679
+ }
15680
+ ._message_1ivu3_1 {
15659
15681
  font-weight: 300;
15660
15682
  margin-left: 0;
15661
15683
  margin-right: 0;
15662
15684
  white-space: normal;
15663
15685
  }
15664
15686
 
15665
- ._systemRole_17kai_8 {
15687
+ ._systemRole_1ivu3_8 {
15666
15688
  opacity: 0.7;
15667
15689
  }
15668
15690
 
15669
- ._messageGrid_17kai_12 {
15691
+ ._messageGrid_1ivu3_12 {
15670
15692
  display: grid;
15671
15693
  grid-template-columns: max-content max-content max-content;
15672
15694
  column-gap: 0.3em;
@@ -15674,24 +15696,28 @@ pre[class*="language-"] {
15674
15696
  margin-bottom: 0.3em;
15675
15697
  }
15676
15698
 
15677
- ._messageContents_17kai_20 {
15699
+ ._messageContents_1ivu3_20 {
15678
15700
  margin-left: 0;
15679
15701
  padding-bottom: 0;
15680
15702
  }
15681
15703
 
15682
- ._messageContents_17kai_20._indented_17kai_25 {
15704
+ ._messageContents_1ivu3_20._indented_1ivu3_25 {
15683
15705
  margin-left: 0rem;
15684
15706
  }
15685
15707
 
15686
- ._copyLink_17kai_29 {
15708
+ ._copyLink_1ivu3_29 {
15687
15709
  opacity: 0;
15688
15710
  padding-left: 0;
15689
15711
  padding-right: 2em;
15690
15712
  }
15691
15713
 
15692
- ._copyLink_17kai_29:hover {
15714
+ ._copyLink_1ivu3_29:hover {
15693
15715
  opacity: 1;
15694
15716
  }
15717
+
15718
+ ._metadataLabel_1ivu3_39 {
15719
+ padding-top: 1em;
15720
+ }
15695
15721
  ._webSearch_1376z_1 {
15696
15722
  display: grid;
15697
15723
  grid-template-columns: max-content 1fr;
@@ -15702,28 +15728,6 @@ pre[class*="language-"] {
15702
15728
  ._query_1376z_8 {
15703
15729
  font-family: var(--bs-font-monospace);
15704
15730
  }
15705
- ._keyPairContainer_1ltuo_1 {
15706
- display: grid;
15707
- grid-template-columns: max-content auto;
15708
- column-gap: 0.5em;
15709
- padding-top: 4px;
15710
- padding-bottom: 4px;
15711
- border-bottom: solid 1px var(--bs-border-color);
15712
- }
15713
-
15714
- ._key_1ltuo_1 {
15715
- display: grid;
15716
- grid-template-columns: 1em auto;
15717
- cursor: pointer;
15718
- }
15719
-
15720
- ._pre_1ltuo_16 {
15721
- margin-bottom: 0;
15722
- }
15723
-
15724
- ._treeIcon_1ltuo_20 {
15725
- margin-top: -3px;
15726
- }
15727
15731
  ._contentData_1sd1z_1 {
15728
15732
  border: solid var(--bs-light-border-subtle) 1px;
15729
15733
  padding: 0.5em;