inspect-ai 0.3.75__py3-none-any.whl → 0.3.77__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. inspect_ai/_cli/eval.py +16 -0
  2. inspect_ai/_display/core/results.py +6 -1
  3. inspect_ai/_eval/eval.py +8 -1
  4. inspect_ai/_eval/evalset.py +6 -2
  5. inspect_ai/_eval/registry.py +3 -5
  6. inspect_ai/_eval/run.py +7 -2
  7. inspect_ai/_eval/task/run.py +4 -0
  8. inspect_ai/_util/content.py +3 -0
  9. inspect_ai/_util/logger.py +3 -0
  10. inspect_ai/_view/www/dist/assets/index.css +28 -16
  11. inspect_ai/_view/www/dist/assets/index.js +4811 -4609
  12. inspect_ai/_view/www/log-schema.json +79 -9
  13. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +22 -4
  14. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +1 -1
  15. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +1 -1
  16. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -2
  17. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +1 -1
  18. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +2 -2
  19. inspect_ai/_view/www/src/types/log.d.ts +11 -5
  20. inspect_ai/log/_recorders/json.py +8 -0
  21. inspect_ai/log/_transcript.py +13 -4
  22. inspect_ai/model/_call_tools.py +13 -4
  23. inspect_ai/model/_chat_message.py +3 -0
  24. inspect_ai/model/_model.py +5 -1
  25. inspect_ai/model/_model_output.py +6 -1
  26. inspect_ai/model/_openai.py +78 -10
  27. inspect_ai/model/_openai_responses.py +277 -0
  28. inspect_ai/model/_providers/anthropic.py +134 -75
  29. inspect_ai/model/_providers/azureai.py +2 -2
  30. inspect_ai/model/_providers/mistral.py +29 -13
  31. inspect_ai/model/_providers/openai.py +64 -57
  32. inspect_ai/model/_providers/openai_responses.py +177 -0
  33. inspect_ai/model/_providers/openrouter.py +52 -2
  34. inspect_ai/model/_providers/providers.py +1 -1
  35. inspect_ai/model/_providers/vertex.py +5 -2
  36. inspect_ai/tool/__init__.py +6 -0
  37. inspect_ai/tool/_tool.py +23 -3
  38. inspect_ai/tool/_tool_call.py +5 -2
  39. inspect_ai/tool/_tool_support_helpers.py +200 -0
  40. inspect_ai/tool/_tools/_bash_session.py +119 -0
  41. inspect_ai/tool/_tools/_computer/_computer.py +1 -1
  42. inspect_ai/tool/_tools/_text_editor.py +121 -0
  43. inspect_ai/tool/_tools/_think.py +48 -0
  44. inspect_ai/tool/_tools/_web_browser/_back_compat.py +150 -0
  45. inspect_ai/tool/_tools/_web_browser/_web_browser.py +75 -130
  46. inspect_ai/tool/_tools/_web_search.py +1 -1
  47. inspect_ai/util/_json.py +28 -0
  48. inspect_ai/util/_sandbox/context.py +16 -7
  49. inspect_ai/util/_sandbox/docker/config.py +1 -1
  50. inspect_ai/util/_sandbox/docker/internal.py +3 -3
  51. {inspect_ai-0.3.75.dist-info → inspect_ai-0.3.77.dist-info}/METADATA +5 -2
  52. {inspect_ai-0.3.75.dist-info → inspect_ai-0.3.77.dist-info}/RECORD +56 -80
  53. {inspect_ai-0.3.75.dist-info → inspect_ai-0.3.77.dist-info}/WHEEL +1 -1
  54. inspect_ai/model/_image.py +0 -15
  55. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +0 -8
  56. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +0 -24
  57. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +0 -25
  58. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +0 -22
  59. inspect_ai/tool/_tools/_web_browser/_resources/README.md +0 -63
  60. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +0 -71
  61. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +0 -323
  62. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +0 -5
  63. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +0 -279
  64. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +0 -9
  65. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +0 -293
  66. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +0 -94
  67. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +0 -2
  68. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +0 -2
  69. inspect_ai/tool/_tools/_web_browser/_resources/mock_environment.py +0 -45
  70. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +0 -50
  71. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +0 -48
  72. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +0 -280
  73. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +0 -65
  74. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +0 -64
  75. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +0 -146
  76. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +0 -64
  77. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +0 -180
  78. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +0 -99
  79. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +0 -15
  80. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +0 -44
  81. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +0 -39
  82. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +0 -214
  83. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +0 -35
  84. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +0 -192
  85. {inspect_ai-0.3.75.dist-info → inspect_ai-0.3.77.dist-info}/entry_points.txt +0 -0
  86. {inspect_ai-0.3.75.dist-info → inspect_ai-0.3.77.dist-info/licenses}/LICENSE +0 -0
  87. {inspect_ai-0.3.75.dist-info → inspect_ai-0.3.77.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -115,6 +115,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Tags to associate with this evaluation run.",
         envvar="INSPECT_EVAL_TAGS",
     )
+    @click.option(
+        "--metadata",
+        multiple=True,
+        type=str,
+        help="Metadata to associate with this evaluation run (more than one --metadata argument can be specified).",
+        envvar="INSPECT_EVAL_METADATA",
+    )
     @click.option(
         "--trace",
         type=bool,
@@ -449,6 +456,7 @@ def eval_command(
     s: tuple[str] | None,
     solver_config: str | None,
     tags: str | None,
+    metadata: tuple[str] | None,
     trace: bool | None,
     approval: str | None,
     sandbox: str | None,
@@ -525,6 +533,7 @@ def eval_command(
         s=s,
         solver_config=solver_config,
         tags=tags,
+        metadata=metadata,
         trace=trace,
         approval=approval,
         sandbox=sandbox,
@@ -616,6 +625,7 @@ def eval_set_command(
     s: tuple[str] | None,
     solver_config: str | None,
     tags: str | None,
+    metadata: tuple[str] | None,
     sandbox: str | None,
     no_sandbox_cleanup: bool | None,
     epochs: int | None,
@@ -695,6 +705,7 @@ def eval_set_command(
         s=s,
         solver_config=solver_config,
         tags=tags,
+        metadata=metadata,
         trace=trace,
         approval=approval,
         sandbox=sandbox,
@@ -749,6 +760,7 @@ def eval_exec(
     s: tuple[str] | None,
     solver_config: str | None,
     tags: str | None,
+    metadata: tuple[str] | None,
     trace: bool | None,
     approval: str | None,
     sandbox: str | None,
@@ -790,6 +802,9 @@ def eval_exec(
     # parse tags
     eval_tags = parse_comma_separated(tags)

+    # parse metadata
+    eval_metadata = parse_cli_args(metadata)
+
     # resolve epochs
     eval_epochs = (
         Epochs(epochs, create_reducers(parse_comma_separated(epochs_reducer)))
@@ -825,6 +840,7 @@ def eval_exec(
         task_args=task_args,
         solver=SolverSpec(solver, solver_args) if solver else None,
         tags=eval_tags,
+        metadata=eval_metadata,
         trace=trace,
         approval=approval,
         sandbox=parse_sandbox(sandbox),
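
With these changes, metadata can be supplied on the command line as repeated key=value pairs, e.g. `inspect eval ctf.py --metadata experiment=baseline --metadata run=3` (hypothetical invocation). A minimal sketch of the kind of parsing parse_cli_args implies; parse_kv below is an illustrative stand-in, not inspect_ai's actual helper:

    # illustrative stand-in for parse_cli_args: fold repeated key=value
    # CLI arguments into a metadata dict (assumed behavior, not the real helper)
    def parse_kv(args: tuple[str, ...] | None) -> dict[str, str] | None:
        if not args:
            return None
        metadata: dict[str, str] = {}
        for arg in args:
            key, sep, value = arg.partition("=")
            if not sep:
                raise ValueError(f"expected key=value, got '{arg}'")
            metadata[key.strip()] = value.strip()
        return metadata

    assert parse_kv(("experiment=baseline", "run=3")) == {"experiment": "baseline", "run": "3"}
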
inspect_ai/_display/core/results.py CHANGED
@@ -131,9 +131,14 @@ def task_stats(stats: EvalStats) -> RenderableType:
     else:
         input_tokens = f"[bold]I: [/bold]{usage.input_tokens:,}"

+    if usage.reasoning_tokens is not None:
+        reasoning_tokens = f", [bold]R: [/bold]{usage.reasoning_tokens:,}"
+    else:
+        reasoning_tokens = ""
+
     table.add_row(
         Text(model, style="bold"),
-        f" {usage.total_tokens:,} tokens [{input_tokens}, [bold]O: [/bold]{usage.output_tokens:,}]",
+        f" {usage.total_tokens:,} tokens [{input_tokens}, [bold]O: [/bold]{usage.output_tokens:,}{reasoning_tokens}]",
         style=theme.light,
     )

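When a model reports reasoning tokens, the stats line above gains an R: segment. A small illustrative reproduction of the string being assembled (plain text here, without the rich markup):

    # illustrative reproduction of the token stats line, minus rich markup
    def usage_line(total: int, input_tokens: int, output_tokens: int,
                   reasoning_tokens: int | None = None) -> str:
        r = f", R: {reasoning_tokens:,}" if reasoning_tokens is not None else ""
        return f"{total:,} tokens [I: {input_tokens:,}, O: {output_tokens:,}{r}]"

    assert usage_line(1500, 1000, 500) == "1,500 tokens [I: 1,000, O: 500]"
    assert usage_line(1700, 1000, 700, 200) == "1,700 tokens [I: 1,000, O: 700, R: 200]"
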
inspect_ai/_eval/eval.py CHANGED
@@ -68,6 +68,7 @@ def eval(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
     display: DisplayType | None = None,
     approval: str | list[ApprovalPolicy] | None = None,
@@ -116,6 +117,7 @@ def eval(
         solver: Alternative solver for task(s).
             Optional (uses task solver by default).
         tags: Tags to associate with this evaluation run.
+        metadata: Metadata to associate with this evaluation run.
         trace: Trace message interactions with evaluated model to terminal.
         display: Task display type (defaults to 'full').
         approval: Tool use approval policies.
@@ -186,6 +188,7 @@ def eval(
         sandbox_cleanup=sandbox_cleanup,
         solver=solver,
         tags=tags,
+        metadata=metadata,
         approval=approval,
         log_level=log_level,
         log_level_transcript=log_level_transcript,
@@ -235,6 +238,7 @@ async def eval_async(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,
@@ -274,7 +278,8 @@ async def eval_async(
         sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
         sandbox_cleanup: Cleanup sandbox environments after task completes (defaults to True)
         solver: Alternative solver for task(s). Optional (uses task solver by default).
-        tags (list[str] | None): Tags to associate with this evaluation run.
+        tags: Tags to associate with this evaluation run.
+        metadata: Metadata to associate with this evaluation run.
         approval: Tool use approval policies.
             Either a path to an approval policy config file or a list of approval policies.
             Defaults to no approval policy.
@@ -449,6 +454,7 @@ async def eval_async(
         epochs_reducer=epochs_reducer,
         solver=solver,
         tags=tags,
+        metadata=metadata,
         score=score,
         debug_errors=debug_errors is True,
         **kwargs,
@@ -473,6 +479,7 @@ async def eval_async(
         epochs_reducer=epochs_reducer,
         solver=solver,
         tags=tags,
+        metadata=metadata,
         score=score,
         **kwargs,
     )
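
Together these changes expose run-level metadata through the Python API as well. A minimal usage sketch (task and model names are placeholders):

    from inspect_ai import eval

    # hypothetical task and model; metadata is the parameter added here
    logs = eval(
        "my_task",
        model="openai/gpt-4o",
        tags=["nightly"],
        metadata={"experiment": "baseline", "run": 3},
    )
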
inspect_ai/_eval/evalset.py CHANGED
@@ -35,7 +35,7 @@ from inspect_ai.model import (
 from inspect_ai.model._generate_config import GenerateConfig
 from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import DisplayType, SandboxEnvironmentType
-from inspect_ai.util._display import init_display_type
+from inspect_ai.util._display import display_type_initialized, init_display_type

 from .eval import eval, eval_init
 from .loader import resolve_task_args
@@ -68,6 +68,7 @@ def eval_set(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
     display: DisplayType | None = None,
     approval: str | list[ApprovalPolicy] | None = None,
@@ -127,6 +128,7 @@ def eval_set(
         solver: Alternative solver(s) for
             evaluating task(s). ptional (uses task solver by default).
         tags: Tags to associate with this evaluation run.
+        metadata: Metadata to associate with this evaluation run.
         trace: Trace message interactions with evaluated model to terminal.
         display: Task display type (defaults to 'full').
         approval: Tool use approval policies.
@@ -193,6 +195,7 @@ def eval_set(
         sandbox_cleanup=sandbox_cleanup,
         solver=solver,
         tags=tags,
+        metadata=metadata,
         trace=trace,
         display=display,
         approval=approval,
@@ -234,7 +237,8 @@ def eval_set(
         return results

     # initialise display (otherwise eval_init will set it to full)
-    display = init_display_type(display)
+    if not display_type_initialized():
+        display = init_display_type(display)
     if display == "conversation":
         raise RuntimeError("eval_set cannot be used with conversation display.")

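eval_set gains the same metadata parameter, and now leaves an already-initialized display type untouched instead of re-initializing it. A minimal usage sketch (task files and log_dir are placeholders; eval_set is assumed to return a success flag plus logs, as in current inspect_ai docs):

    from inspect_ai import eval_set

    # placeholder tasks and log directory; metadata mirrors eval()
    success, logs = eval_set(
        ["task1.py", "task2.py"],
        log_dir="logs/run-1",
        metadata={"suite": "regression"},
    )
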
inspect_ai/_eval/registry.py CHANGED
@@ -75,12 +75,10 @@ def task_create(name: str, **kwargs: Any) -> Task:
     task_params: list[str] = task_info.metadata["params"]
     task_args: dict[str, Any] = {}
     for param in kwargs.keys():
-        if param in task_params:
+        if param in task_params or "kwargs" in task_params:
             task_args[param] = kwargs[param]
-            if "kwargs" in task_params:
-                task_args[param] = kwargs[param]
-            else:
-                logger.warning(f"param '{param}' not used by task '{name}'")
+        else:
+            logger.warning(f"param '{param}' not used by task '{name}'")

     return cast(Task, registry_create("task", name, **task_args))

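The rewritten loop fixes the filtering in one pass: tasks that declare **kwargs now receive every caller argument, and the "not used" warning fires only for genuinely unused params. A self-contained sketch of the new behavior:

    # self-contained sketch of the fixed argument filtering above
    def filter_task_args(task_params: list[str], kwargs: dict) -> dict:
        task_args = {}
        for param in kwargs:
            if param in task_params or "kwargs" in task_params:
                task_args[param] = kwargs[param]
            else:
                print(f"param '{param}' not used")  # stand-in for logger.warning
        return task_args

    # task declares **kwargs: everything is forwarded
    assert filter_task_args(["difficulty", "kwargs"], {"difficulty": "hard", "seed": 7}) == {"difficulty": "hard", "seed": 7}
    # no **kwargs: unknown params are dropped (with a warning)
    assert filter_task_args(["difficulty"], {"difficulty": "hard", "seed": 7}) == {"difficulty": "hard"}
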
inspect_ai/_eval/run.py CHANGED
@@ -2,8 +2,9 @@ import functools
 import logging
 import os
 import sys
-from typing import Awaitable, Callable, Set, cast
+from typing import Any, Awaitable, Callable, Set, cast

+from inspect_ai._eval.task.task import Task
 from inspect_ai._util.trace import trace_action

 if sys.version_info < (3, 11):
@@ -67,6 +68,7 @@ async def eval_run(
     epochs_reducer: list[ScoreReducer] | None = None,
     solver: Solver | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     debug_errors: bool = False,
     score: bool = True,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -81,6 +83,7 @@ async def eval_run(
     eval_wd = os.getcwd()

     # ensure sample ids
+    task: Task | None = None
     for resolved_task in tasks:
         # add sample ids to dataset if they aren't there (start at 1 not 0)
         task = resolved_task.task
@@ -91,6 +94,8 @@ async def eval_run(
         # Ensure sample ids are unique
         ensure_unique_ids(task.dataset)

+    assert task, "Must encounter a task"
+
     # run startup pass for the sandbox environments
     shutdown_sandbox_environments: Callable[[], Awaitable[None]] | None = None
     if has_sandbox:
@@ -201,7 +206,7 @@ async def eval_run(
         task_args=resolved_task.task_args,
         model_args=resolved_task.model.model_args,
         eval_config=task_eval_config,
-        metadata=task.metadata,
+        metadata=((metadata or {}) | (task.metadata or {})) or None,
         recorder=recorder,
     )
     await logger.init()
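
The merge expression in the last hunk defines the precedence for the new run-level metadata: it forms the base, task-level metadata overrides it key by key, and an entirely empty result collapses to None. For example:

    # precedence of ((metadata or {}) | (task.metadata or {})) or None
    run_metadata = {"suite": "nightly", "owner": "evals"}
    task_metadata = {"owner": "task-team"}

    merged = ((run_metadata or {}) | (task_metadata or {})) or None
    assert merged == {"suite": "nightly", "owner": "task-team"}  # task wins on conflict

    empty = ((None or {}) | (None or {})) or None
    assert empty is None  # no metadata anywhere yields None, not {}
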
inspect_ai/_eval/task/run.py CHANGED
@@ -599,6 +599,10 @@ async def task_run_sample(
     )

     async with sandboxenv_cm:
+        timeout_cm: (
+            contextlib._GeneratorContextManager[anyio.CancelScope, None, None]
+            | contextlib.nullcontext[None]
+        ) = contextlib.nullcontext()
         try:
             # update active sample wth sandboxes now that we are initialised
             active.sandboxes = await sandbox_connections()
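
The timeout_cm annotation above supports a common pattern: default to a no-op context manager so a later `with timeout_cm:` works whether or not a timeout scope was actually created. A minimal sketch:

    import contextlib

    # default to a no-op context manager; a real cancel scope can be
    # swapped in later when a timeout is requested
    timeout_cm: contextlib.AbstractContextManager = contextlib.nullcontext()

    with timeout_cm:
        pass  # body runs unchanged when no timeout is active
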
inspect_ai/_util/content.py CHANGED
@@ -12,6 +12,9 @@ class ContentText(BaseModel):
     text: str
     """Text content."""

+    refusal: bool | None = Field(default=None)
+    """Was this a refusal message?"""
+

 class ContentReasoning(BaseModel):
     """Reasoning content.
inspect_ai/_util/logger.py CHANGED
@@ -150,6 +150,9 @@ def init_logger(log_level: str | None, log_level_transcript: str | None = None)
         transcript_levelno=transcript_levelno,
     )

+    # set the global log level
+    getLogger().setLevel(log_level)
+
     # set the log level for our package
     getLogger(PKG_NAME).setLevel(capture_level)
     getLogger(PKG_NAME).addHandler(_logHandler)
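
In effect, the root logger's threshold now follows the requested level rather than staying at Python's default of WARNING, so records from other libraries can reach root handlers too. In isolation:

    import logging

    # the root logger defaults to WARNING; lowering it lets INFO records through
    logging.getLogger().setLevel(logging.INFO)
    logging.info("now visible to root handlers at INFO and above")
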
inspect_ai/_view/www/dist/assets/index.css CHANGED
@@ -16461,44 +16461,44 @@ ul.jsondiffpatch-textdiff {
   font-weight: 600;
   padding-bottom: 0.3em;
 }
-._output_3axgd_1 {
+._output_15urk_1 {
   padding-top: 1em;
 }

-._container_3axgd_5 {
+._container_15urk_5 {
   margin: 0.5em 0;
   width: 100%;
 }

-._all_3axgd_10 {
+._all_15urk_10 {
   display: grid;
   grid-template-columns: 1fr 1fr 1fr;
   column-gap: 1em;
 }

-._tableSelection_3axgd_16 {
+._tableSelection_15urk_16 {
   width: fit-content;
   align-self: start;
   justify-self: start;
 }

-._tools_3axgd_22 {
+._tools_15urk_22 {
   grid-column: -1/1;
 }

-._codePre_3axgd_26 {
+._codePre_15urk_26 {
   background: var(--bs-light);
   width: 100%;
   padding: 0.5em;
   border-radius: var(--bs-border-radius);
 }

-._code_3axgd_26 {
-  white-space: pre-wrap;
-  word-wrap: anywhere;
+._code_15urk_26 {
+  white-space: pre-wrap !important;
+  word-wrap: anywhere !important;
 }

-._toolConfig_3axgd_38 {
+._toolConfig_15urk_38 {
   display: grid;
   grid-template-columns: max-content auto;
   column-gap: 1em;
@@ -17032,12 +17032,14 @@ div.ap-player div.ap-control-bar * {
 div.ap-control-bar svg.ap-icon path {
   fill: var(--term-color-foreground);
 }
-div.ap-control-bar span.ap-playback-button {
+div.ap-control-bar span.ap-button {
   display: flex;
   flex: 0 0 auto;
   cursor: pointer;
-  height: 12px;
+}
+div.ap-control-bar span.ap-playback-button {
   width: 12px;
+  height: 12px;
   padding: 10px;
 }
 div.ap-control-bar span.ap-playback-button svg {
@@ -17104,13 +17106,9 @@ div.ap-control-bar.ap-seekable .ap-progressbar .ap-bar {
   cursor: pointer;
 }
 div.ap-control-bar .ap-fullscreen-button {
-  display: block;
-  flex: 0 0 auto;
   width: 14px;
   height: 14px;
   padding: 9px;
-  cursor: pointer;
-  position: relative;
 }
 div.ap-control-bar .ap-fullscreen-button svg {
   width: 14px;
@@ -17127,6 +17125,20 @@ div.ap-control-bar .ap-fullscreen-button .ap-tooltip {
   left: initial;
   transform: none;
 }
+div.ap-control-bar .ap-kbd-button {
+  height: 14px;
+  padding: 9px;
+  margin: 0 4px;
+}
+div.ap-control-bar .ap-kbd-button svg {
+  width: 26px;
+  height: 14px;
+}
+div.ap-control-bar .ap-kbd-button .ap-tooltip {
+  right: 5px;
+  left: initial;
+  transform: none;
+}
 div.ap-wrapper.ap-hud .ap-control-bar {
   opacity: 1;
 }