inspect-ai 0.3.104__py3-none-any.whl → 0.3.105__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38):
  1. inspect_ai/_eval/evalset.py +1 -1
  2. inspect_ai/_eval/task/run.py +64 -38
  3. inspect_ai/_view/server.py +17 -0
  4. inspect_ai/_view/www/dist/assets/index.css +33 -29
  5. inspect_ai/_view/www/dist/assets/index.js +559 -247
  6. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +4 -0
  7. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -0
  8. inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +26 -0
  9. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +14 -3
  10. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +359 -7
  11. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/language.ts +6 -0
  12. inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +1 -1
  13. inspect_ai/_view/www/src/client/api/api-browser.ts +25 -0
  14. inspect_ai/_view/www/src/client/api/api-http.ts +3 -0
  15. inspect_ai/_view/www/src/client/api/api-vscode.ts +6 -0
  16. inspect_ai/_view/www/src/client/api/client-api.ts +3 -0
  17. inspect_ai/_view/www/src/client/api/jsonrpc.ts +1 -0
  18. inspect_ai/_view/www/src/client/api/types.ts +3 -0
  19. inspect_ai/_view/www/src/state/samplePolling.ts +17 -1
  20. inspect_ai/agent/_handoff.py +5 -2
  21. inspect_ai/agent/_react.py +5 -5
  22. inspect_ai/dataset/_dataset.py +1 -1
  23. inspect_ai/log/_samples.py +5 -0
  24. inspect_ai/model/_call_tools.py +4 -4
  25. inspect_ai/model/_providers/anthropic.py +23 -2
  26. inspect_ai/model/_providers/google.py +5 -1
  27. inspect_ai/util/__init__.py +8 -0
  28. inspect_ai/util/_background.py +64 -0
  29. inspect_ai/util/_limit.py +72 -5
  30. inspect_ai/util/_sandbox/__init__.py +2 -0
  31. inspect_ai/util/_sandbox/service.py +28 -7
  32. inspect_ai/util/_subprocess.py +51 -38
  33. {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/METADATA +1 -1
  34. {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/RECORD +38 -37
  35. {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/WHEEL +0 -0
  36. {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/entry_points.txt +0 -0
  37. {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/licenses/LICENSE +0 -0
  38. {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/top_level.txt +0 -0
@@ -578,7 +578,7 @@ def task_identifier(task: ResolvedTask | EvalLog) -> str:
578
578
  else:
579
579
  task_file = task.eval.task_file or ""
580
580
  task_name = task.eval.task
581
- task_args = task.eval.task_args
581
+ task_args = task.eval.task_args_passed
582
582
  model = str(task.eval.model)
583
583
  model_roles = task.eval.model_roles or {}
584
584
 
@@ -10,6 +10,7 @@ from pathlib import PurePath
10
10
  from typing import Callable, Literal
11
11
 
12
12
  import anyio
13
+ from anyio.abc import TaskGroup
13
14
  from typing_extensions import Unpack
14
15
 
15
16
  from inspect_ai._display import (
@@ -306,37 +307,57 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
306
307
  task.metrics,
307
308
  )
308
309
 
310
+ async def run_sample(
311
+ sample: Sample, state: TaskState
312
+ ) -> dict[str, SampleScore] | None:
313
+ result: dict[str, SampleScore] | None = None
314
+
315
+ async def run(tg: TaskGroup) -> None:
316
+ try:
317
+ nonlocal result
318
+ result = await task_run_sample(
319
+ tg=tg,
320
+ task_name=task.name,
321
+ log_location=profile.log_location,
322
+ sample=sample,
323
+ state=state,
324
+ sandbox=sandbox,
325
+ max_sandboxes=config.max_sandboxes,
326
+ sandbox_cleanup=sandbox_cleanup,
327
+ plan=plan,
328
+ scorers=scorers,
329
+ generate=generate,
330
+ progress=progress,
331
+ logger=logger if log_samples else None,
332
+ log_images=log_images,
333
+ sample_source=sample_source,
334
+ sample_error=sample_error_handler,
335
+ sample_complete=sample_complete,
336
+ fails_on_error=(
337
+ config.fail_on_error is None
338
+ or config.fail_on_error is True
339
+ ),
340
+ retry_on_error=config.retry_on_error or 0,
341
+ error_retries=[],
342
+ time_limit=config.time_limit,
343
+ working_limit=config.working_limit,
344
+ semaphore=sample_semaphore,
345
+ )
346
+ finally:
347
+ tg.cancel_scope.cancel()
348
+
349
+ async with anyio.create_task_group() as tg:
350
+ tg.start_soon(run, tg)
351
+
352
+ return result
353
+
309
354
  sample_results = await tg_collect(
310
355
  [
311
- functools.partial(
312
- task_run_sample,
313
- task_name=task.name,
314
- log_location=profile.log_location,
315
- sample=sample,
316
- state=state,
317
- sandbox=sandbox,
318
- max_sandboxes=config.max_sandboxes,
319
- sandbox_cleanup=sandbox_cleanup,
320
- plan=plan,
321
- scorers=scorers,
322
- generate=generate,
323
- progress=progress,
324
- logger=logger if log_samples else None,
325
- log_images=log_images,
326
- sample_source=sample_source,
327
- sample_error=sample_error_handler,
328
- sample_complete=sample_complete,
329
- fails_on_error=(
330
- config.fail_on_error is None
331
- or config.fail_on_error is True
332
- ),
333
- retry_on_error=config.retry_on_error or 0,
334
- error_retries=[],
335
- time_limit=config.time_limit,
336
- working_limit=config.working_limit,
337
- semaphore=sample_semaphore,
356
+ functools.partial(run_sample, sample, state)
357
+ for (sample, state) in zip(
358
+ samples,
359
+ states,
338
360
  )
339
- for (sample, state) in zip(samples, states)
340
361
  ]
341
362
  )
342
363
 
@@ -492,6 +513,7 @@ def update_metrics_display_fn(
492
513
 
493
514
  async def task_run_sample(
494
515
  *,
516
+ tg: TaskGroup,
495
517
  task_name: str,
496
518
  log_location: str,
497
519
  sample: Sample,
@@ -611,12 +633,14 @@ async def task_run_sample(
611
633
  working_limit=working_limit,
612
634
  fails_on_error=fails_on_error or (retry_on_error > 0),
613
635
  transcript=sample_transcript,
636
+ tg=tg,
614
637
  ) as active,
615
638
  ):
616
639
  start_time: float | None = None
617
640
  error: EvalError | None = None
618
641
  raise_error: BaseException | None = None
619
642
  results: dict[str, SampleScore] = {}
643
+ limit: EvalSampleLimit | None = None
620
644
  try:
621
645
  # begin init
622
646
  init_span = span("init", type="init")
@@ -704,9 +728,17 @@ async def task_run_sample(
704
728
  # handle the cancel exception
705
729
  raise
706
730
 
707
- except (LimitExceededError, TerminateSampleError):
731
+ except LimitExceededError as ex:
732
+ # capture most recent state for scoring
733
+ state = sample_state() or state
734
+ limit = EvalSampleLimit(
735
+ type=ex.type, limit=ex.limit if ex.limit is not None else -1
736
+ )
737
+
738
+ except TerminateSampleError:
708
739
  # capture most recent state for scoring
709
740
  state = sample_state() or state
741
+ limit = EvalSampleLimit(type="operator", limit=1)
710
742
 
711
743
  except BaseException as ex:
712
744
  error, raise_error = handle_error(ex)
@@ -815,6 +847,7 @@ async def task_run_sample(
815
847
  state=state,
816
848
  scores=results,
817
849
  error=error,
850
+ limit=limit,
818
851
  error_retries=error_retries,
819
852
  log_images=log_images,
820
853
  )
@@ -854,6 +887,7 @@ async def task_run_sample(
854
887
  time_limit=time_limit,
855
888
  working_limit=working_limit,
856
889
  semaphore=semaphore,
890
+ tg=tg,
857
891
  )
858
892
 
859
893
  # no error
@@ -879,6 +913,7 @@ async def log_sample(
879
913
  state: TaskState,
880
914
  scores: dict[str, SampleScore],
881
915
  error: EvalError | None,
916
+ limit: EvalSampleLimit | None,
882
917
  error_retries: list[EvalError],
883
918
  log_images: bool,
884
919
  ) -> None:
@@ -894,15 +929,6 @@ async def log_sample(
894
929
  # compute total time if we can
895
930
  total_time = time.monotonic() - start_time if start_time is not None else None
896
931
 
897
- # if a limit was hit, note that in the Eval Sample
898
- limit = None
899
- for e in transcript().events:
900
- if e.event == "sample_limit":
901
- limit = EvalSampleLimit(
902
- type=e.type, limit=e.limit if e.limit is not None else -1
903
- )
904
- break
905
-
906
932
  eval_sample = EvalSample(
907
933
  id=id,
908
934
  epoch=state.epoch,
@@ -155,6 +155,23 @@ def view_server(
155
155
  body=samples.model_dump_json(), headers={"ETag": samples.etag}
156
156
  )
157
157
 
158
+ @routes.get("/api/log-message")
159
+ async def api_log_message(request: web.Request) -> web.Response:
160
+ # log file requested
161
+ file = query_param_required("log_file", request, str)
162
+
163
+ file = urllib.parse.unquote(file)
164
+ validate_log_file_request(file)
165
+
166
+ # message to log
167
+ message = query_param_required("message", request, str)
168
+
169
+ # log the message
170
+ logger.warning(f"[CLIENT MESSAGE] ({file}): {message}")
171
+
172
+ # respond
173
+ return web.Response(status=204)
174
+
158
175
  @routes.get("/api/pending-sample-data")
159
176
  async def api_sample_events(request: web.Request) -> web.Response:
160
177
  # log file requested
@@ -15655,18 +15655,40 @@ pre[class*="language-"] {
15655
15655
  background-color: var(--bs-light-border-subtle);
15656
15656
  margin-top: -1px;
15657
15657
  }
15658
- ._message_17kai_1 {
15658
+ ._keyPairContainer_1ltuo_1 {
15659
+ display: grid;
15660
+ grid-template-columns: max-content auto;
15661
+ column-gap: 0.5em;
15662
+ padding-top: 4px;
15663
+ padding-bottom: 4px;
15664
+ border-bottom: solid 1px var(--bs-border-color);
15665
+ }
15666
+
15667
+ ._key_1ltuo_1 {
15668
+ display: grid;
15669
+ grid-template-columns: 1em auto;
15670
+ cursor: pointer;
15671
+ }
15672
+
15673
+ ._pre_1ltuo_16 {
15674
+ margin-bottom: 0;
15675
+ }
15676
+
15677
+ ._treeIcon_1ltuo_20 {
15678
+ margin-top: -3px;
15679
+ }
15680
+ ._message_1ivu3_1 {
15659
15681
  font-weight: 300;
15660
15682
  margin-left: 0;
15661
15683
  margin-right: 0;
15662
15684
  white-space: normal;
15663
15685
  }
15664
15686
 
15665
- ._systemRole_17kai_8 {
15687
+ ._systemRole_1ivu3_8 {
15666
15688
  opacity: 0.7;
15667
15689
  }
15668
15690
 
15669
- ._messageGrid_17kai_12 {
15691
+ ._messageGrid_1ivu3_12 {
15670
15692
  display: grid;
15671
15693
  grid-template-columns: max-content max-content max-content;
15672
15694
  column-gap: 0.3em;
@@ -15674,24 +15696,28 @@ pre[class*="language-"] {
15674
15696
  margin-bottom: 0.3em;
15675
15697
  }
15676
15698
 
15677
- ._messageContents_17kai_20 {
15699
+ ._messageContents_1ivu3_20 {
15678
15700
  margin-left: 0;
15679
15701
  padding-bottom: 0;
15680
15702
  }
15681
15703
 
15682
- ._messageContents_17kai_20._indented_17kai_25 {
15704
+ ._messageContents_1ivu3_20._indented_1ivu3_25 {
15683
15705
  margin-left: 0rem;
15684
15706
  }
15685
15707
 
15686
- ._copyLink_17kai_29 {
15708
+ ._copyLink_1ivu3_29 {
15687
15709
  opacity: 0;
15688
15710
  padding-left: 0;
15689
15711
  padding-right: 2em;
15690
15712
  }
15691
15713
 
15692
- ._copyLink_17kai_29:hover {
15714
+ ._copyLink_1ivu3_29:hover {
15693
15715
  opacity: 1;
15694
15716
  }
15717
+
15718
+ ._metadataLabel_1ivu3_39 {
15719
+ padding-top: 1em;
15720
+ }
15695
15721
  ._webSearch_1376z_1 {
15696
15722
  display: grid;
15697
15723
  grid-template-columns: max-content 1fr;
@@ -15702,28 +15728,6 @@ pre[class*="language-"] {
15702
15728
  ._query_1376z_8 {
15703
15729
  font-family: var(--bs-font-monospace);
15704
15730
  }
15705
- ._keyPairContainer_1ltuo_1 {
15706
- display: grid;
15707
- grid-template-columns: max-content auto;
15708
- column-gap: 0.5em;
15709
- padding-top: 4px;
15710
- padding-bottom: 4px;
15711
- border-bottom: solid 1px var(--bs-border-color);
15712
- }
15713
-
15714
- ._key_1ltuo_1 {
15715
- display: grid;
15716
- grid-template-columns: 1em auto;
15717
- cursor: pointer;
15718
- }
15719
-
15720
- ._pre_1ltuo_16 {
15721
- margin-bottom: 0;
15722
- }
15723
-
15724
- ._treeIcon_1ltuo_20 {
15725
- margin-top: -3px;
15726
- }
15727
15731
  ._contentData_1sd1z_1 {
15728
15732
  border: solid var(--bs-light-border-subtle) 1px;
15729
15733
  padding: 0.5em;