inspect-ai 0.3.87__py3-none-any.whl → 0.3.89__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. inspect_ai/_cli/eval.py +16 -0
  2. inspect_ai/_cli/score.py +1 -12
  3. inspect_ai/_cli/util.py +4 -2
  4. inspect_ai/_display/core/footer.py +2 -2
  5. inspect_ai/_display/plain/display.py +2 -2
  6. inspect_ai/_eval/context.py +7 -1
  7. inspect_ai/_eval/eval.py +51 -27
  8. inspect_ai/_eval/evalset.py +27 -10
  9. inspect_ai/_eval/loader.py +7 -8
  10. inspect_ai/_eval/run.py +23 -31
  11. inspect_ai/_eval/score.py +18 -1
  12. inspect_ai/_eval/task/log.py +5 -13
  13. inspect_ai/_eval/task/resolved.py +1 -0
  14. inspect_ai/_eval/task/run.py +231 -244
  15. inspect_ai/_eval/task/task.py +25 -2
  16. inspect_ai/_eval/task/util.py +1 -8
  17. inspect_ai/_util/constants.py +1 -0
  18. inspect_ai/_util/json.py +8 -3
  19. inspect_ai/_util/registry.py +30 -13
  20. inspect_ai/_view/www/App.css +5 -0
  21. inspect_ai/_view/www/dist/assets/index.css +55 -18
  22. inspect_ai/_view/www/dist/assets/index.js +550 -458
  23. inspect_ai/_view/www/log-schema.json +84 -1
  24. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +1 -1
  25. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +13 -8
  26. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +3 -0
  27. inspect_ai/_view/www/src/plan/ModelCard.module.css +16 -0
  28. inspect_ai/_view/www/src/plan/ModelCard.tsx +93 -0
  29. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +5 -1
  30. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
  31. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +6 -29
  32. inspect_ai/_view/www/src/types/log.d.ts +150 -129
  33. inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.module.css +16 -0
  34. inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.tsx +43 -0
  35. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -1
  36. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +5 -0
  37. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -0
  38. inspect_ai/agent/_agent.py +12 -0
  39. inspect_ai/agent/_as_tool.py +1 -1
  40. inspect_ai/agent/_bridge/bridge.py +9 -2
  41. inspect_ai/agent/_react.py +142 -74
  42. inspect_ai/agent/_run.py +13 -2
  43. inspect_ai/agent/_types.py +6 -0
  44. inspect_ai/approval/_apply.py +6 -9
  45. inspect_ai/approval/_approver.py +3 -3
  46. inspect_ai/approval/_auto.py +2 -2
  47. inspect_ai/approval/_call.py +20 -4
  48. inspect_ai/approval/_human/approver.py +3 -3
  49. inspect_ai/approval/_human/manager.py +2 -2
  50. inspect_ai/approval/_human/panel.py +3 -3
  51. inspect_ai/approval/_policy.py +3 -3
  52. inspect_ai/log/__init__.py +2 -0
  53. inspect_ai/log/_log.py +23 -2
  54. inspect_ai/log/_model.py +58 -0
  55. inspect_ai/log/_recorders/file.py +14 -3
  56. inspect_ai/log/_transcript.py +3 -0
  57. inspect_ai/model/__init__.py +2 -0
  58. inspect_ai/model/_call_tools.py +15 -2
  59. inspect_ai/model/_model.py +49 -3
  60. inspect_ai/model/_openai.py +151 -21
  61. inspect_ai/model/_providers/anthropic.py +25 -14
  62. inspect_ai/model/_providers/bedrock.py +3 -3
  63. inspect_ai/model/_providers/cloudflare.py +29 -108
  64. inspect_ai/model/_providers/google.py +21 -10
  65. inspect_ai/model/_providers/grok.py +23 -17
  66. inspect_ai/model/_providers/groq.py +61 -37
  67. inspect_ai/model/_providers/llama_cpp_python.py +8 -9
  68. inspect_ai/model/_providers/mistral.py +8 -3
  69. inspect_ai/model/_providers/ollama.py +8 -9
  70. inspect_ai/model/_providers/openai.py +53 -157
  71. inspect_ai/model/_providers/openai_compatible.py +195 -0
  72. inspect_ai/model/_providers/openrouter.py +4 -15
  73. inspect_ai/model/_providers/providers.py +11 -0
  74. inspect_ai/model/_providers/together.py +25 -23
  75. inspect_ai/model/_trim.py +83 -0
  76. inspect_ai/solver/_plan.py +5 -3
  77. inspect_ai/tool/_tool_call.py +3 -0
  78. inspect_ai/tool/_tool_def.py +8 -2
  79. inspect_ai/util/__init__.py +3 -0
  80. inspect_ai/util/_concurrency.py +15 -2
  81. {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/METADATA +1 -1
  82. {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/RECORD +86 -81
  83. inspect_ai/_eval/task/rundir.py +0 -78
  84. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
  85. {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/WHEEL +0 -0
  86. {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/entry_points.txt +0 -0
  87. {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/licenses/LICENSE +0 -0
  88. {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/top_level.txt +0 -0
@@ -54,6 +54,7 @@ class Task:
54
54
  metrics: list[Metric] | dict[str, list[Metric]] | None = None,
55
55
  model: str | Model | None = None,
56
56
  config: GenerateConfig = GenerateConfig(),
57
+ model_roles: dict[str, str | Model] | None = None,
57
58
  sandbox: SandboxEnvironmentType | None = None,
58
59
  approval: str | list[ApprovalPolicy] | None = None,
59
60
  epochs: int | Epochs | None = None,
@@ -79,7 +80,8 @@ class Task:
79
80
  scorer: Scorer used to evaluate model output.
80
81
  metrics: Alternative metrics (overrides the metrics provided by the specified scorer).
81
82
  model: Default model for task (Optional, defaults to eval model).
82
- config: Model generation config.
83
+ config: Model generation config for default model (does not apply to model roles)
84
+ model_roles: Named roles for use in `get_model()`.
83
85
  sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
84
86
  approval: Tool use approval policies.
85
87
  Either a path to an approval policy config file or a list of approval policies. Defaults to no approval policy.
@@ -136,6 +138,7 @@ class Task:
136
138
  self.metrics = metrics
137
139
  self.model = resolve_model(model)
138
140
  self.config = config
141
+ self.model_roles = resolve_model_roles(model_roles)
139
142
  self.sandbox = resolve_sandbox_environment(sandbox)
140
143
  self.approval = resolve_approval(approval)
141
144
  epochs = resolve_epochs(epochs)
@@ -185,6 +188,7 @@ def task_with(
185
188
  metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
186
189
  model: str | Model | NotGiven = NOT_GIVEN,
187
190
  config: GenerateConfig | NotGiven = NOT_GIVEN,
191
+ model_roles: dict[str, str | Model] | NotGiven = NOT_GIVEN,
188
192
  sandbox: SandboxEnvironmentType | None | NotGiven = NOT_GIVEN,
189
193
  approval: str | list[ApprovalPolicy] | None | NotGiven = NOT_GIVEN,
190
194
  epochs: int | Epochs | None | NotGiven = NOT_GIVEN,
@@ -214,7 +218,8 @@ def task_with(
214
218
  scorer: Scorer used to evaluate model output.
215
219
  metrics: Alternative metrics (overrides the metrics provided by the specified scorer).
216
220
  model: Default model for task (Optional, defaults to eval model).
217
- config: Model generation config.
221
+ config: Model generation config for default model (does not apply to model roles)
222
+ model_roles: Named roles for use in `get_model()`.
218
223
  sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
219
224
  approval: Tool use approval policies.
220
225
  Either a path to an approval policy config file or a list of approval policies. Defaults to no approval policy.
@@ -257,6 +262,8 @@ def task_with(
257
262
  task.model = resolve_model(model)
258
263
  if not isinstance(config, NotGiven):
259
264
  task.config = config
265
+ if not isinstance(model_roles, NotGiven):
266
+ task.model_roles = resolve_model_roles(model_roles)
260
267
  if not isinstance(sandbox, NotGiven):
261
268
  task.sandbox = resolve_sandbox_environment(sandbox)
262
269
  if not isinstance(approval, NotGiven):
@@ -315,6 +322,7 @@ class PreviousTask:
315
322
  task: str | Task
316
323
  task_args: dict[str, Any]
317
324
  model: Model | None
325
+ model_roles: dict[str, Model] | None
318
326
  log: EvalLog
319
327
 
320
328
 
@@ -365,6 +373,21 @@ def resolve_model(model: str | Model | None) -> Model | None:
365
373
  return model
366
374
 
367
375
 
376
+ def resolve_model_roles(
377
+ model_roles: dict[str, str | Model] | None,
378
+ ) -> dict[str, Model] | None:
379
+ if model_roles is not None:
380
+ resolved_model_roles = {
381
+ k: get_model(v, memoize=False) if isinstance(v, str) else v
382
+ for k, v in model_roles.items()
383
+ }
384
+ for k, v in resolved_model_roles.items():
385
+ v._set_role(k)
386
+ return resolved_model_roles
387
+ else:
388
+ return None
389
+
390
+
368
391
  def resolve_scorer(scorer: Scorer | list[Scorer] | None) -> list[Scorer] | None:
369
392
  return (
370
393
  scorer if isinstance(scorer, list) else [scorer] if scorer is not None else None
@@ -25,13 +25,6 @@ def task_run_dir(task: Task) -> str:
25
25
  return getattr(task, TASK_RUN_DIR_ATTR, os.getcwd())
26
26
 
27
27
 
28
- def task_chdir(task: Task) -> str | None:
29
- if task.attribs.get("chdir", False) is True:
30
- return task_run_dir(task)
31
- else:
32
- return None
33
-
34
-
35
28
  def task_file(task: Task, relative: bool = False) -> str | None:
36
29
  file = cast(str | None, getattr(task, TASK_FILE_ATTR, None))
37
30
  if file:
@@ -46,7 +39,7 @@ def task_file(task: Task, relative: bool = False) -> str | None:
46
39
  def slice_dataset(
47
40
  dataset: Dataset,
48
41
  limit: int | tuple[int, int] | None,
49
- sample_id: str | int | list[str | int] | None,
42
+ sample_id: str | int | list[str] | list[int] | list[str | int] | None,
50
43
  ) -> Dataset:
51
44
  def normalise(id: str | int | None) -> str:
52
45
  if isinstance(id, str) and id.isdigit():
@@ -38,6 +38,7 @@ CONSOLE_DISPLAY_WIDTH = 120
38
38
  BASE_64_DATA_REMOVED = "<base64-data-removed>"
39
39
  SANDBOX_SETUP_TIMEOUT = 300
40
40
  NO_CONTENT = "(no content)"
41
+ MODEL_NONE = "none/none"
41
42
 
42
43
  DESERIALIZING = "deserializing"
43
44
  DESERIALIZING_CONTEXT = {DESERIALIZING: True}
inspect_ai/_util/json.py CHANGED
@@ -93,9 +93,14 @@ def json_changes(
93
93
  replaced = before
94
94
  for path in paths:
95
95
  decoded_path = decode_json_pointer_segment(path)
96
- index: Any = (
97
- int(decoded_path) if decoded_path.isnumeric() else decoded_path
98
- )
96
+ if isinstance(replaced, list):
97
+ if not decoded_path.isnumeric():
98
+ raise ValueError(
99
+ f"Invalid JSON Pointer segment for list: {decoded_path}"
100
+ )
101
+ index = int(decoded_path)
102
+ else:
103
+ index = decoded_path
99
104
  replaced = replaced[index]
100
105
  json_change.replaced = replaced
101
106
  changes.append(json_change)
@@ -14,18 +14,24 @@ from .entrypoints import ensure_entry_points
14
14
  obj_type = type
15
15
 
16
16
  RegistryType = Literal[
17
- "modelapi",
18
17
  "task",
19
18
  "solver",
20
- "plan",
19
+ "agent",
20
+ "tool",
21
21
  "scorer",
22
22
  "metric",
23
- "tool",
24
- "agent",
25
- "sandboxenv",
26
23
  "score_reducer",
24
+ "modelapi",
25
+ "sandboxenv",
27
26
  "approver",
28
27
  ]
28
+ """Enumeration of registry object types.
29
+
30
+ These are the types of objects in this system that can be
31
+ registered using a decorator (e.g. `@task`, `@solver`).
32
+ Registered objects can in turn be created dynamically using
33
+ the `registry_create()` function.
34
+ """
29
35
 
30
36
 
31
37
  class RegistryInfo(BaseModel):
@@ -181,17 +187,28 @@ def registry_find(predicate: Callable[[RegistryInfo], bool]) -> list[object]:
181
187
  def registry_create(type: RegistryType, name: str, **kwargs: Any) -> object:
182
188
  r"""Create a registry object.
183
189
 
184
- Registry objects can be ordinary functions that implement a protocol,
185
- factory functions that return a function based on **kwargs, or classes
186
- deriving that can be created using **kwargs
190
+ Creates objects registered via decorator (e.g. `@task`, `@solver`). Note
191
+ that this can also create registered objects within Python packages, in
192
+ which case the name of the package should be used a prefix, e.g.
193
+
194
+ ```python
195
+ registry_create("scorer", "mypackage/myscorer", ...)
196
+ ```
197
+
198
+ Object within the Inspect package do not require a prefix, nor do
199
+ objects from imported modules that aren't in a package.
187
200
 
188
201
  Args:
189
- type (RegistryType): Type of registry object to create
190
- name (str): Name of registry options to create
191
- **kwargs (Any): Optional creation arguments
202
+ type: Type of registry object to create
203
+ name: Name of registry object to create
204
+ **kwargs: Optional creation arguments
192
205
 
193
206
  Returns:
194
- Registry object with registry info attribute
207
+ Instance of specified name and type.
208
+
209
+ Raises:
210
+ LookupError: If the named object was not found in the registry.
211
+ TypeError: If the specified parameters are not valid for the object.
195
212
  """
196
213
  # lookup the object
197
214
  obj = registry_lookup(type, name)
@@ -225,7 +242,7 @@ def registry_create(type: RegistryType, name: str, **kwargs: Any) -> object:
225
242
  else:
226
243
  return obj
227
244
  else:
228
- raise ValueError(f"{name} was not found in the registry")
245
+ raise LookupError(f"{name} was not found in the registry")
229
246
 
230
247
 
231
248
  def registry_info(o: object) -> RegistryInfo:
@@ -31,6 +31,7 @@
31
31
  --inspect-font-size-base: 0.9rem;
32
32
  --inspect-font-size-small: 0.8rem;
33
33
  --inspect-font-size-smaller: 0.8rem;
34
+ --inspect-font-size-smallest: 0.7rem;
34
35
 
35
36
  /* Inspect Glass */
36
37
  --inspect-glass-color: #000000;
@@ -113,6 +114,10 @@ body[class^="vscode-"] .app-main-grid {
113
114
  font-size: var(--inspect-font-size-smaller);
114
115
  }
115
116
 
117
+ .text-size-smallest {
118
+ font-size: var(--inspect-font-size-smallest);
119
+ }
120
+
116
121
  .text-truncate {
117
122
  white-space: nowrap;
118
123
  text-overflow: ellipsis;
@@ -14304,6 +14304,7 @@ pre[class*="language-"] {
14304
14304
  --inspect-font-size-base: 0.9rem;
14305
14305
  --inspect-font-size-small: 0.8rem;
14306
14306
  --inspect-font-size-smaller: 0.8rem;
14307
+ --inspect-font-size-smallest: 0.7rem;
14307
14308
 
14308
14309
  /* Inspect Glass */
14309
14310
  --inspect-glass-color: #000000;
@@ -14386,6 +14387,10 @@ body[class^="vscode-"] .app-main-grid {
14386
14387
  font-size: var(--inspect-font-size-smaller);
14387
14388
  }
14388
14389
 
14390
+ .text-size-smallest {
14391
+ font-size: var(--inspect-font-size-smallest);
14392
+ }
14393
+
14389
14394
  .text-truncate {
14390
14395
  white-space: nowrap;
14391
14396
  text-overflow: ellipsis;
@@ -15934,37 +15939,37 @@ ul.jsondiffpatch-textdiff {
15934
15939
  ._number_140x5_7 {
15935
15940
  margin-top: 0.1em;
15936
15941
  }
15937
- ._table_1memb_1 {
15942
+ ._table_9qith_1 {
15938
15943
  padding-left: 0;
15939
15944
  margin-left: 0;
15940
15945
  margin-bottom: 0.2rem;
15941
15946
  }
15942
15947
 
15943
- ._th_1memb_7 {
15948
+ ._th_9qith_7 {
15944
15949
  padding: 0;
15945
15950
  }
15946
15951
 
15947
- ._cell_1memb_11 {
15948
- padding: 0.3em 0.3em 0.3em 0em;
15952
+ ._cell_9qith_11 {
15953
+ padding: 0em 0.5em 0.3em 0em !important;
15949
15954
  }
15950
15955
 
15951
- ._compact_1memb_15 ._cell_1memb_11 {
15956
+ ._compact_9qith_15 ._cell_9qith_11 {
15952
15957
  padding: 0;
15953
15958
  }
15954
15959
 
15955
- ._cellKey_1memb_19 {
15960
+ ._cellKey_9qith_19 {
15956
15961
  font-weight: 400;
15957
15962
  padding-right: 1em;
15958
15963
  white-space: nowrap;
15959
15964
  }
15960
15965
 
15961
- ._compact_1memb_15 ._cellKey_1memb_19 {
15966
+ ._compact_9qith_15 ._cellKey_9qith_19 {
15962
15967
  font-weight: 400;
15963
15968
  padding-right: 0.2em;
15964
15969
  white-space: nowrap;
15965
15970
  }
15966
15971
 
15967
- ._cellValue_1memb_31 {
15972
+ ._cellValue_9qith_31 {
15968
15973
  font-weight: 300;
15969
15974
  white-space: pre-wrap;
15970
15975
  word-wrap: anywhere;
@@ -19645,6 +19650,22 @@ span.ap-marker-container:hover span.ap-marker {
19645
19650
  ._text_1yknn_20 {
19646
19651
  margin-top: -2px;
19647
19652
  }
19653
+ ._container_304w9_1 {
19654
+ display: grid;
19655
+ grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
19656
+ row-gap: 2em;
19657
+ column-gap: 1em;
19658
+ }
19659
+
19660
+ ._modelInfo_304w9_8 {
19661
+ display: grid;
19662
+ grid-template-columns: max-content auto;
19663
+ column-gap: 1em;
19664
+ }
19665
+
19666
+ ._role_304w9_14 {
19667
+ grid-column: -1/1;
19668
+ }
19648
19669
  ._item_1uzhd_1 {
19649
19670
  margin-bottom: 0em;
19650
19671
  }
@@ -19808,55 +19829,71 @@ span.ap-marker-container:hover span.ap-marker {
19808
19829
  ._copyButton_1goi8_1:hover {
19809
19830
  opacity: 0.75;
19810
19831
  }
19811
- ._container_14b93_1 {
19832
+ ._container_q17yq_1 {
19833
+ display: flex;
19834
+ flex-direction: row;
19835
+ flex-wrap: wrap;
19836
+ gap: 0;
19837
+ margin-top: -0.2rem;
19838
+ margin-bottom: 0.2rem;
19839
+ }
19840
+
19841
+ ._grid_q17yq_10 {
19842
+ display: grid;
19843
+ grid-template-rows: repeat(auto-fill, minmax(10px, 1fr));
19844
+ grid-template-columns: 1fr;
19845
+ gap: 0.1em;
19846
+ padding-right: 1em;
19847
+ }
19848
+ ._container_291sb_1 {
19812
19849
  display: flex;
19813
19850
  padding-top: 0;
19814
19851
  margin-left: 0.5rem;
19815
19852
  min-width: 250px;
19816
19853
  }
19817
19854
 
19818
- ._wrapper_14b93_8 {
19855
+ ._wrapper_291sb_8 {
19819
19856
  display: grid;
19820
19857
  grid-template-columns: minmax(auto, 1fr) 1fr;
19821
19858
  width: 100%;
19822
19859
  }
19823
19860
 
19824
- ._toggle_14b93_14 {
19861
+ ._toggle_291sb_14 {
19825
19862
  padding: 0rem 0.1rem 0.1rem 0rem;
19826
19863
  display: flex;
19827
19864
  }
19828
19865
 
19829
- ._body_14b93_19 {
19866
+ ._body_291sb_19 {
19830
19867
  display: flex;
19831
19868
  flex-direction: column;
19832
19869
  margin-left: 0.2rem;
19833
19870
  }
19834
19871
 
19835
- ._bodyContainer_14b93_25 {
19872
+ ._bodyContainer_291sb_25 {
19836
19873
  margin-top: 0.1rem;
19837
19874
  display: grid;
19838
19875
  grid-template-columns: minmax(30px, max-content) minmax(100px, max-content);
19839
19876
  }
19840
19877
 
19841
- ._taskTitle_14b93_31 {
19878
+ ._taskTitle_291sb_31 {
19842
19879
  font-weight: 600;
19843
19880
  margin-right: 0.3rem;
19844
19881
  }
19845
19882
 
19846
- ._taskModel_14b93_36 {
19883
+ ._taskModel_291sb_36 {
19847
19884
  padding-top: 0.4rem;
19848
19885
  }
19849
19886
 
19850
- ._taskStatus_14b93_40 {
19887
+ ._taskStatus_291sb_40 {
19851
19888
  display: flex;
19852
19889
  justify-content: end;
19853
19890
  margin-right: 1em;
19854
19891
  margin-bottom: 0;
19855
19892
  }
19856
19893
 
19857
- ._secondaryContainer_14b93_47 {
19894
+ ._secondaryContainer_291sb_47 {
19858
19895
  opacity: 0.7;
19859
- margin-top: 0.1rem;
19896
+ margin-top: -0.1rem;
19860
19897
  padding-bottom: 0;
19861
19898
  display: grid;
19862
19899
  grid-template-columns: minmax(0, max-content) max-content;