inspect-ai 0.3.74__py3-none-any.whl → 0.3.76__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. inspect_ai/__init__.py +3 -2
  2. inspect_ai/_cli/cache.py +1 -1
  3. inspect_ai/_cli/common.py +15 -0
  4. inspect_ai/_cli/eval.py +4 -5
  5. inspect_ai/_cli/log.py +1 -1
  6. inspect_ai/_cli/sandbox.py +1 -1
  7. inspect_ai/_cli/trace.py +1 -1
  8. inspect_ai/_cli/view.py +1 -1
  9. inspect_ai/_display/core/config.py +3 -1
  10. inspect_ai/_eval/eval.py +55 -61
  11. inspect_ai/_eval/evalset.py +64 -154
  12. inspect_ai/_eval/loader.py +27 -54
  13. inspect_ai/_eval/registry.py +4 -15
  14. inspect_ai/_eval/run.py +7 -4
  15. inspect_ai/_eval/task/__init__.py +8 -2
  16. inspect_ai/_eval/task/log.py +9 -1
  17. inspect_ai/_eval/task/resolved.py +35 -0
  18. inspect_ai/_eval/task/run.py +4 -0
  19. inspect_ai/_eval/task/task.py +50 -69
  20. inspect_ai/_eval/task/tasks.py +30 -0
  21. inspect_ai/_util/constants.py +3 -0
  22. inspect_ai/_util/dotenv.py +17 -0
  23. inspect_ai/_util/logger.py +3 -0
  24. inspect_ai/_util/registry.py +43 -2
  25. inspect_ai/_view/server.py +28 -10
  26. inspect_ai/_view/www/dist/assets/index.css +32 -19
  27. inspect_ai/_view/www/dist/assets/index.js +17682 -29989
  28. inspect_ai/_view/www/log-schema.json +79 -9
  29. inspect_ai/_view/www/package.json +2 -2
  30. inspect_ai/_view/www/src/appearance/styles.ts +6 -5
  31. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
  32. inspect_ai/_view/www/src/constants.ts +3 -0
  33. inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
  34. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
  35. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
  36. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
  37. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +1 -1
  38. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -2
  39. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
  40. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
  41. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +1 -1
  42. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
  43. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +2 -2
  44. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
  45. inspect_ai/_view/www/src/types/log.d.ts +11 -5
  46. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
  47. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
  48. inspect_ai/_view/www/yarn.lock +12 -5
  49. inspect_ai/log/_log.py +10 -1
  50. inspect_ai/log/_recorders/eval.py +27 -8
  51. inspect_ai/log/_recorders/json.py +10 -2
  52. inspect_ai/log/_transcript.py +13 -4
  53. inspect_ai/model/_call_tools.py +13 -4
  54. inspect_ai/model/_chat_message.py +15 -1
  55. inspect_ai/model/_model.py +30 -12
  56. inspect_ai/model/_model_output.py +6 -1
  57. inspect_ai/model/_openai.py +11 -6
  58. inspect_ai/model/_providers/anthropic.py +167 -77
  59. inspect_ai/model/_providers/google.py +6 -2
  60. inspect_ai/model/_providers/none.py +31 -0
  61. inspect_ai/model/_providers/openai.py +11 -8
  62. inspect_ai/model/_providers/providers.py +7 -0
  63. inspect_ai/model/_providers/vertex.py +5 -2
  64. inspect_ai/solver/_bridge/bridge.py +1 -1
  65. inspect_ai/solver/_chain.py +7 -6
  66. inspect_ai/tool/__init__.py +4 -0
  67. inspect_ai/tool/_tool_call.py +5 -2
  68. inspect_ai/tool/_tool_support_helpers.py +200 -0
  69. inspect_ai/tool/_tools/_bash_session.py +119 -0
  70. inspect_ai/tool/_tools/_computer/_computer.py +1 -1
  71. inspect_ai/tool/_tools/_text_editor.py +121 -0
  72. inspect_ai/tool/_tools/_web_browser/_back_compat.py +150 -0
  73. inspect_ai/tool/_tools/_web_browser/_web_browser.py +75 -130
  74. inspect_ai/tool/_tools/_web_search.py +2 -2
  75. inspect_ai/util/_json.py +28 -0
  76. inspect_ai/util/_sandbox/context.py +18 -8
  77. inspect_ai/util/_sandbox/docker/config.py +1 -1
  78. inspect_ai/util/_sandbox/docker/internal.py +3 -3
  79. inspect_ai/util/_sandbox/environment.py +17 -2
  80. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/METADATA +8 -5
  81. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/RECORD +85 -108
  82. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/WHEEL +1 -1
  83. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +0 -8
  84. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +0 -24
  85. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +0 -25
  86. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +0 -22
  87. inspect_ai/tool/_tools/_web_browser/_resources/README.md +0 -63
  88. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +0 -71
  89. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +0 -323
  90. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +0 -5
  91. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +0 -279
  92. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +0 -9
  93. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +0 -293
  94. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +0 -94
  95. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +0 -2
  96. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +0 -2
  97. inspect_ai/tool/_tools/_web_browser/_resources/mock_environment.py +0 -45
  98. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +0 -50
  99. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +0 -48
  100. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +0 -280
  101. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +0 -65
  102. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +0 -64
  103. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +0 -146
  104. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +0 -64
  105. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +0 -180
  106. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +0 -99
  107. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +0 -15
  108. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +0 -44
  109. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +0 -39
  110. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +0 -214
  111. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +0 -35
  112. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +0 -192
  113. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/entry_points.txt +0 -0
  114. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info/licenses}/LICENSE +0 -0
  115. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/top_level.txt +0 -0
inspect_ai/_view/www/src/types/log.d.ts CHANGED
@@ -148,7 +148,7 @@ export type Input =
  | ChatMessageAssistant
  | ChatMessageTool
  )[];
- export type Id1 = string;
+ export type Id1 = string | null;
  export type Content =
  | string
  | (
@@ -175,7 +175,7 @@ export type Video = string;
  export type Format1 = "mp4" | "mpeg" | "mov";
  export type Source = ("input" | "generate") | null;
  export type Role = "system";
- export type Id2 = string;
+ export type Id2 = string | null;
  export type Content1 =
  | string
  | (
@@ -188,7 +188,7 @@ export type Content1 =
  export type Source1 = ("input" | "generate") | null;
  export type Role1 = "user";
  export type ToolCallId = string[] | null;
- export type Id3 = string;
+ export type Id3 = string | null;
  export type Content2 =
  | string
  | (
@@ -203,12 +203,13 @@ export type Role2 = "assistant";
  export type ToolCalls = ToolCall[] | null;
  export type Id4 = string;
  export type Function = string;
- export type Type8 = "function";
+ export type Type8 = string;
+ export type InternalName = string | null;
  export type ParseError = string | null;
  export type Title = string | null;
  export type Format2 = "text" | "markdown";
  export type Content3 = string;
- export type Id5 = string;
+ export type Id5 = string | null;
  export type Content4 =
  | string
  | (
@@ -222,6 +223,7 @@ export type Source3 = ("input" | "generate") | null;
  export type Role3 = "tool";
  export type ToolCallId1 = string | null;
  export type Function1 = string | null;
+ export type InternalName1 = string | null;
  export type Type9 =
  | "parsing"
  | "timeout"
@@ -369,6 +371,7 @@ export type Event6 = "tool";
  export type Type12 = "function";
  export type Id7 = string;
  export type Function2 = string;
+ export type InternalName2 = string | null;
  export type Result1 =
  | string
  | number
@@ -911,6 +914,7 @@ export interface ToolCall {
  function: Function;
  arguments: Arguments;
  type: Type8;
+ internal_name: InternalName;
  parse_error: ParseError;
  view: ToolCallContent | null;
  }
@@ -933,6 +937,7 @@ export interface ChatMessageTool {
  role: Role3;
  tool_call_id: ToolCallId1;
  function: Function1;
+ internal_name: InternalName1;
  error: ToolCallError | null;
  }
  export interface ToolCallError {
@@ -1201,6 +1206,7 @@ export interface ToolEvent {
  id: Id7;
  function: Function2;
  arguments: Arguments1;
+ internal_name: InternalName2;
  view: ToolCallContent | null;
  result: Result1;
  truncated: Truncated;
inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx CHANGED
@@ -3,6 +3,7 @@ import { FC, useCallback } from "react";
  import { SampleSummary } from "../../api/types";
  import { ApplicationIcons } from "../../appearance/icons";
  import { CopyButton } from "../../components/CopyButton";
+ import { kModelNone } from "../../constants";
  import { EvalResults, EvalSpec, Status } from "../../types/log";
  import { filename } from "../../utils/path";
  import styles from "./PrimaryBar.module.css";
@@ -71,18 +72,22 @@ export const PrimaryBar: FC<PrimaryBarProps> = ({
  >
  {evalSpec?.task}
  </div>
- <div
- id="task-model"
- className={clsx(
- "task-model",
- "text-truncate",
- styles.taskModel,
- "text-size-base",
- )}
- title={evalSpec?.model}
- >
- {evalSpec?.model}
- </div>
+ {evalSpec?.model && evalSpec.model !== kModelNone ? (
+ <div
+ id="task-model"
+ className={clsx(
+ "task-model",
+ "text-truncate",
+ styles.taskModel,
+ "text-size-base",
+ )}
+ title={evalSpec?.model}
+ >
+ {evalSpec?.model}
+ </div>
+ ) : (
+ ""
+ )}
  </div>
  <div className={clsx("text-size-small", styles.secondaryContainer)}>
  <div className={clsx("navbar-secondary-text", "text-truncate")}>
inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx CHANGED
@@ -1,6 +1,7 @@
  import clsx from "clsx";
  import { FC, Fragment } from "react";
  import { EvalLogHeader } from "../../api/types";
+ import { kModelNone } from "../../constants";
  import { EvalStatus } from "./EvalStatus";
  import styles from "./SidebarLogEntry.module.css";

@@ -51,7 +52,7 @@ export const SidebarLogEntry: FC<SidebarLogEntryProps> = ({
  </div>
  <small className={clsx("mb-1", "text-size-small")}>{timeStr}</small>

- {model ? (
+ {model && model !== kModelNone ? (
  <div>
  <small className={clsx("mb-1", "text-size-small")}>{model}</small>
  </div>
inspect_ai/_view/www/yarn.lock CHANGED
@@ -220,13 +220,20 @@
  "@codemirror/view" "^6.0.0"
  crelt "^1.0.5"

- "@codemirror/state@^6.0.0", "@codemirror/state@^6.4.0", "@codemirror/state@^6.5.0", "@codemirror/state@^6.5.1":
+ "@codemirror/state@^6.0.0", "@codemirror/state@^6.4.0", "@codemirror/state@^6.5.0":
  version "6.5.1"
  resolved "https://registry.yarnpkg.com/@codemirror/state/-/state-6.5.1.tgz#e5c0599f7b43cf03f19e05861317df5425c07904"
  integrity sha512-3rA9lcwciEB47ZevqvD8qgbzhM9qMb8vCcQCNmDfVRPQG4JT9mSb0Jg8H7YjKGGQcFnLN323fj9jdnG59Kx6bg==
  dependencies:
  "@marijn/find-cluster-break" "^1.0.0"

+ "@codemirror/state@^6.5.2":
+ version "6.5.2"
+ resolved "https://registry.yarnpkg.com/@codemirror/state/-/state-6.5.2.tgz#8eca3a64212a83367dc85475b7d78d5c9b7076c6"
+ integrity sha512-FVqsPqtPWKVVL3dPSxy8wEF/ymIEuVzF1PK3VbUgrxXpJUSHQWWZz4JMToquRxnkw+36LTamCZG2iua2Ptq0fA==
+ dependencies:
+ "@marijn/find-cluster-break" "^1.0.0"
+
  "@codemirror/view@^6.0.0", "@codemirror/view@^6.17.0", "@codemirror/view@^6.23.0", "@codemirror/view@^6.27.0", "@codemirror/view@^6.35.0":
  version "6.36.2"
  resolved "https://registry.yarnpkg.com/@codemirror/view/-/view-6.36.2.tgz#aeb644e161440734ac5a153bf6e5b4a4355047be"
@@ -862,10 +869,10 @@ argparse@^2.0.1:
  resolved "https://registry.yarnpkg.com/argparse/-/argparse-2.0.1.tgz#246f50f3ca78a3240f6c997e8a9bd1eac49e4b38"
  integrity sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==

- asciinema-player@^3.8.2:
- version "3.8.2"
- resolved "https://registry.yarnpkg.com/asciinema-player/-/asciinema-player-3.8.2.tgz#12fbf475ddaeee0051ace17532e5f003475f6dfa"
- integrity sha512-Lgcnj9u/H6sRpGRX1my7Azcay6llLmB/GVkCGcDbPwdTVTisS1ir8SQ9jRWRvjlLUjpSJkN0euruvy3sLRM8tw==
+ asciinema-player@^3.9.0:
+ version "3.9.0"
+ resolved "https://registry.yarnpkg.com/asciinema-player/-/asciinema-player-3.9.0.tgz#c60742f85978e861b878fc7eb6289a5622c298af"
+ integrity sha512-SXVFImVzeNr8ZUdNIHABGuzlbnGWTKy245AquAjODsAnv+Lp6vxjYGN0LfA8ns30tnx/ag/bMrTbLq13TpHE6w==
  dependencies:
  "@babel/runtime" "^7.21.0"
  solid-js "^1.3.0"
inspect_ai/log/_log.py CHANGED
@@ -215,7 +215,16 @@ class EvalSample(BaseModel):
  Returns:
  StoreModel: Instance of model_cls bound to sample store data.
  """
- return model_cls(store=Store(self.store))
+ # un-namespace names for creation
+ data = {
+ k.replace(f"{model_cls.__name__}:", "", 1): v for k, v in self.store.items()
+ }
+
+ # since we are reading from the log provide a fully detached store
+ data["store"] = Store()
+
+ # create the model
+ return model_cls.model_validate(data)

  events: list[Event] = Field(default_factory=list)
  """Events that occurred during sample execution."""
inspect_ai/log/_recorders/eval.py CHANGED
@@ -10,7 +10,7 @@ from pydantic import BaseModel, Field
  from pydantic_core import to_json
  from typing_extensions import override

- from inspect_ai._util.constants import LOG_SCHEMA_VERSION
+ from inspect_ai._util.constants import DESERIALIZING_CONTEXT, LOG_SCHEMA_VERSION
  from inspect_ai._util.content import (
  ContentAudio,
  ContentImage,
@@ -224,7 +224,9 @@ class EvalRecorder(FileRecorder):
  with ZipFile(z, mode="r") as zip:
  try:
  with zip.open(_sample_filename(id, epoch), "r") as f:
- return EvalSample(**json.load(f))
+ return EvalSample.model_validate(
+ json.load(f), context=DESERIALIZING_CONTEXT
+ )
  except KeyError:
  raise IndexError(
  f"Sample id {id} for epoch {epoch} not found in log {location}"
@@ -414,7 +416,10 @@ def _read_log(log: BinaryIO, location: str, header_only: bool = False) -> EvalLo
  if REDUCTIONS_JSON in zip.namelist():
  with zip.open(REDUCTIONS_JSON, "r") as f:
  reductions = [
- EvalSampleReductions(**reduction) for reduction in json.load(f)
+ EvalSampleReductions.model_validate(
+ reduction, context=DESERIALIZING_CONTEXT
+ )
+ for reduction in json.load(f)
  ]
  if evalLog.results is not None:
  evalLog.reductions = reductions
@@ -425,7 +430,11 @@ def _read_log(log: BinaryIO, location: str, header_only: bool = False) -> EvalLo
  for name in zip.namelist():
  if name.startswith(f"{SAMPLES_DIR}/") and name.endswith(".json"):
  with zip.open(name, "r") as f:
- samples.append(EvalSample(**json.load(f)))
+ samples.append(
+ EvalSample.model_validate(
+ json.load(f), context=DESERIALIZING_CONTEXT
+ ),
+ )
  sort_samples(samples)
  evalLog.samples = samples
  return evalLog
@@ -452,7 +461,10 @@ def _read_all_summaries(zip: ZipFile, count: int) -> list[SampleSummary]:
  if SUMMARIES_JSON in zip.namelist():
  summaries_raw = _read_json(zip, SUMMARIES_JSON)
  if isinstance(summaries_raw, list):
- return [SampleSummary(**value) for value in summaries_raw]
+ return [
+ SampleSummary.model_validate(value, context=DESERIALIZING_CONTEXT)
+ for value in summaries_raw
+ ]
  else:
  raise ValueError(
  f"Expected a list of summaries when reading {SUMMARIES_JSON}"
@@ -464,7 +476,14 @@ def _read_all_summaries(zip: ZipFile, count: int) -> list[SampleSummary]:
  summary_path = _journal_summary_path(summary_file)
  summary = _read_json(zip, summary_path)
  if isinstance(summary, list):
- summaries.extend([SampleSummary(**value) for value in summary])
+ summaries.extend(
+ [
+ SampleSummary.model_validate(
+ value, context=DESERIALIZING_CONTEXT
+ )
+ for value in summary
+ ]
+ )
  else:
  raise ValueError(
  f"Expected a list of summaries when reading {summary_file}"
@@ -476,12 +495,12 @@ def _read_header(zip: ZipFile, location: str) -> EvalLog:
  # first see if the header is here
  if HEADER_JSON in zip.namelist():
  with zip.open(HEADER_JSON, "r") as f:
- log = EvalLog(**json.load(f))
+ log = EvalLog.model_validate(json.load(f), context=DESERIALIZING_CONTEXT)
  log.location = location
  return log
  else:
  with zip.open(_journal_path(START_JSON), "r") as f:
- start = LogStart(**json.load(f))
+ start = LogStart.model_validate(json.load(f), context=DESERIALIZING_CONTEXT)
  return EvalLog(
  version=start.version, eval=start.eval, plan=start.plan, location=location
  )
inspect_ai/log/_recorders/json.py CHANGED
@@ -7,7 +7,7 @@ from pydantic import BaseModel
  from pydantic_core import from_json
  from typing_extensions import override

- from inspect_ai._util.constants import LOG_SCHEMA_VERSION
+ from inspect_ai._util.constants import DESERIALIZING_CONTEXT, LOG_SCHEMA_VERSION
  from inspect_ai._util.error import EvalError
  from inspect_ai._util.file import absolute_file_path, file
  from inspect_ai._util.trace import trace_action
@@ -143,7 +143,7 @@ class JSONRecorder(FileRecorder):
  with file(location, "r") as f:
  # parse w/ pydantic
  raw_data = from_json(f.read())
- log = EvalLog(**raw_data)
+ log = EvalLog.model_validate(raw_data, context=DESERIALIZING_CONTEXT)
  log.location = location

  # fail for unknown version
@@ -217,6 +217,11 @@ def _read_header_streaming(log_file: str) -> EvalLog:

  # Parse the log file, stopping before parsing samples
  status: Literal["started", "success", "cancelled", "error"] | None = None
+ eval: EvalSpec | None = None
+ plan: EvalPlan | None = None
+ results: EvalResults | None = None
+ stats: EvalStats | None = None
+ error: EvalError | None = None
  for k, v in ijson.kvitems(f, ""):
  if k == "status":
  assert v in get_args(
@@ -239,6 +244,9 @@ def _read_header_streaming(log_file: str) -> EvalLog:
  break

  assert status, "Must encounter a 'status'"
+ assert eval, "Must encounter a 'eval'"
+ assert plan, "Must encounter a 'plan'"
+ assert stats, "Must encounter a 'stats'"

  return EvalLog(
  eval=eval,
inspect_ai/log/_transcript.py CHANGED
@@ -146,7 +146,7 @@ class ModelEvent(BaseEvent):
  """working time for model call that succeeded (i.e. was not retried)."""

  @field_serializer("completed")
- def serialize_completed(self, dt: datetime) -> str:
+ def serialize_completed(self, dt: datetime | None) -> str | None:
  if dt is None:
  return None
  return dt.astimezone().isoformat()
@@ -170,6 +170,9 @@ class ToolEvent(BaseEvent):
  arguments: dict[str, JsonValue]
  """Arguments to function."""

+ internal_name: str | None = Field(default=None)
+ """Internal name for tool (if any)."""
+
  view: ToolCallContent | None = Field(default=None)
  """Custom view of tool call input."""

@@ -235,7 +238,9 @@ class ToolEvent(BaseEvent):
  """Required so that we can include '_cancel_fn' as a member."""

  @field_serializer("completed")
- def serialize_completed(self, dt: datetime) -> str:
+ def serialize_completed(self, dt: datetime | None) -> str | None:
+ if dt is None:
+ return None
  return dt.astimezone().isoformat()


@@ -270,7 +275,9 @@ class SandboxEvent(BaseEvent):
  """Time that sandbox action completed (see `timestamp` for started)"""

  @field_serializer("completed")
- def serialize_completed(self, dt: datetime) -> str:
+ def serialize_completed(self, dt: datetime | None) -> str | None:
+ if dt is None:
+ return None
  return dt.astimezone().isoformat()


@@ -412,7 +419,9 @@ class SubtaskEvent(BaseEvent):
  """Working time for subtask (i.e. time not spent waiting on semaphores or model retries)."""

  @field_serializer("completed")
- def serialize_completed(self, dt: datetime) -> str:
+ def serialize_completed(self, dt: datetime | None) -> str | None:
+ if dt is None:
+ return None
  return dt.astimezone().isoformat()

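The four `serialize_completed` changes above follow the same pattern: the serializer now accepts an optional datetime and passes `None` through. A small self-contained Pydantic sketch of that pattern (the model name here is illustrative):

    from datetime import datetime
    from pydantic import BaseModel, field_serializer

    class ExampleEvent(BaseModel):
        completed: datetime | None = None

        @field_serializer("completed")
        def serialize_completed(self, dt: datetime | None) -> str | None:
            # pass None through; otherwise serialize as local-time ISO 8601
            if dt is None:
                return None
            return dt.astimezone().isoformat()

    print(ExampleEvent().model_dump())                          # {'completed': None}
    print(ExampleEvent(completed=datetime.now()).model_dump())  # ISO 8601 string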
inspect_ai/model/_call_tools.py CHANGED
@@ -25,7 +25,6 @@ from typing import (
  if sys.version_info < (3, 11):
  from exceptiongroup import ExceptionGroup

-
  import anyio
  import yaml
  from anyio.streams.memory import MemoryObjectSendStream
@@ -168,6 +167,7 @@ async def call_tools(
  id=call.id,
  function=call.function,
  arguments=call.arguments,
+ internal_name=call.internal_name,
  result=content,
  truncated=truncated,
  view=call.view,
@@ -183,6 +183,7 @@ async def call_tools(
  content=content,
  tool_call_id=call.id,
  function=call.function,
+ internal_name=call.internal_name,
  error=tool_error,
  ),
  event,
@@ -201,6 +202,7 @@ async def call_tools(
  id=call.id,
  function=call.function,
  arguments=call.arguments,
+ internal_name=call.internal_name,
  view=call.view,
  pending=True,
  )
@@ -216,9 +218,7 @@ async def call_tools(
  tg.start_soon(call_tool_task, call, send_stream)
  event._set_cancel_fn(tg.cancel_scope.cancel)
  async with receive_stream:
- async for result in receive_stream:
- tool_message, result_event = result
- break
+ tool_message, result_event = await receive_stream.receive()
  except ExceptionGroup as ex:
  raise ex.exceptions[0]

@@ -226,6 +226,7 @@ async def call_tools(
  tool_message = ChatMessageTool(
  content="",
  function=call.function,
+ internal_name=call.internal_name,
  tool_call_id=call.id,
  error=ToolCallError(
  "timeout", "Command timed out before completing."
@@ -235,6 +236,7 @@ async def call_tools(
  id=call.id,
  function=call.function,
  arguments=call.arguments,
+ internal_name=call.internal_name,
  result=tool_message.content,
  truncated=None,
  view=call.view,
@@ -508,6 +510,13 @@ def tool_parse_error_message(arguments: str, ex: Exception) -> str:
  def parse_tool_call(
  id: str, function: str, arguments: str, tools: list[ToolInfo] | None = None
  ) -> ToolCall:
+ """Parse a tool call from a JSON payload.
+
+ Note that this function doesn't know about internal tool names, so the caller
+ should amend the returned `ToolCall` by mapping the parsed `function` field
+ from an internal name to an inspect tool name and fixing up the `ToolCall`
+ object as required to reflect this change.
+ """
  error: str | None = None
  arguments_dict: dict[str, Any] = {}

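The new docstring on `parse_tool_call` says the caller is responsible for mapping a provider-internal tool name back to an Inspect tool name. A hypothetical sketch of that fix-up; the mapping and helper below are illustrative, not part of inspect_ai:

    # Illustrative mapping from a provider-internal tool name to an Inspect tool name.
    INTERNAL_TO_INSPECT = {"str_replace_editor": "text_editor"}

    def fixup_internal_name(call):  # `call` is the ToolCall returned by parse_tool_call
        inspect_name = INTERNAL_TO_INSPECT.get(call.function)
        if inspect_name is not None:
            call.internal_name = call.function  # remember the provider's name
            call.function = inspect_name        # expose the Inspect tool name
        return call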
inspect_ai/model/_chat_message.py CHANGED
@@ -4,6 +4,7 @@ from typing import Any, Literal, Type, Union
  from pydantic import BaseModel, Field, model_validator
  from shortuuid import uuid

+ from inspect_ai._util.constants import DESERIALIZING
  from inspect_ai._util.content import Content, ContentReasoning, ContentText
  from inspect_ai.tool import ToolCall
  from inspect_ai.tool._tool_call import ToolCallError
@@ -16,7 +17,7 @@ logger = getLogger(__name__)
  class ChatMessageBase(BaseModel):
  """Base class for chat messages."""

- id: str = Field(default_factory=uuid)
+ id: str | None = Field(default=None)
  """Unique identifer for message."""

  content: str | list[Content]
@@ -25,6 +26,16 @@ class ChatMessageBase(BaseModel):
  source: Literal["input", "generate"] | None = Field(default=None)
  """Source of message."""

+ def model_post_init(self, __context: Any) -> None:
+ # check if deserializing
+ is_deserializing = isinstance(__context, dict) and __context.get(
+ DESERIALIZING, False
+ )
+
+ # Generate ID if needed and not deserializing
+ if self.id is None and not is_deserializing:
+ self.id = uuid()
+
  @property
  def text(self) -> str:
  """Get the text content of this message.
@@ -147,6 +158,9 @@ class ChatMessageTool(ChatMessageBase):
  function: str | None = Field(default=None)
  """Name of function called."""

+ internal_name: str | None = Field(default=None)
+ """Internal name for tool (if any)."""
+
  error: ToolCallError | None = Field(default=None)
  """Error which occurred during tool call."""

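The `ChatMessageBase` change above works together with the recorder changes earlier in this diff: message IDs are now optional, and `model_post_init` only mints a new ID when the model is not being rehydrated from a log (the recorders pass `DESERIALIZING_CONTEXT` to `model_validate`). A self-contained sketch of the pattern; the context key value below is an assumption, not copied from `inspect_ai._util.constants`:

    from typing import Any
    from pydantic import BaseModel, Field
    from shortuuid import uuid

    DESERIALIZING = "deserializing"  # assumed value of the constant imported in the diff

    class Message(BaseModel):
        id: str | None = Field(default=None)

        def model_post_init(self, __context: Any) -> None:
            # only generate an id when not deserializing from a log
            is_deserializing = isinstance(__context, dict) and __context.get(DESERIALIZING, False)
            if self.id is None and not is_deserializing:
                self.id = uuid()

    print(Message().id)                                                  # freshly generated
    print(Message.model_validate({}, context={DESERIALIZING: True}).id)  # None (preserved)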
inspect_ai/model/_model.py CHANGED
@@ -33,6 +33,7 @@ from inspect_ai._util.content import (
  from inspect_ai._util.hooks import init_hooks, override_api_key, send_telemetry
  from inspect_ai._util.interrupt import check_sample_interrupt
  from inspect_ai._util.logger import warn_once
+ from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
  from inspect_ai._util.platform import platform_init
  from inspect_ai._util.registry import (
  RegistryInfo,
@@ -77,7 +78,7 @@ class ModelAPI(abc.ABC):
  by the user. You can then pass these on to the approriate place in
  your model initialisation code (for example, here is what many
  of the built-in providers do with the `model_args` passed to them:
- https://inspect.ai-safety-institute.org.uk/models.html#model-args)
+ https://inspect.aisi.org.uk/models.html#model-args)
  """

  def __init__(
@@ -232,15 +233,19 @@ class Model:
  config: GenerateConfig
  """Generation config."""

- def __init__(self, api: ModelAPI, config: GenerateConfig) -> None:
+ def __init__(
+ self, api: ModelAPI, config: GenerateConfig, model_args: dict[str, Any] = {}
+ ) -> None:
  """Create a model.

  Args:
  api: Model API provider.
  config: Model configuration.
+ model_args: Optional model args
  """
  self.api = api
  self.config = config
+ self.model_args = model_args

  # state indicating whether our lifetime is bound by a context manager
  self._context_bound = False
@@ -449,6 +454,7 @@ class Model:
  async def generate() -> ModelOutput:
  check_sample_interrupt()

+ cache_entry: CacheEntry | None
  if cache:
  if isinstance(cache, CachePolicy):
  policy = cache
@@ -476,6 +482,8 @@ class Model:
  call=None,
  )
  return existing
+ else:
+ cache_entry = None

  # verify that model apis are allowed
  self.verify_model_apis()
@@ -545,7 +553,7 @@ class Model:
  json.dumps(dict(model=str(self), usage=output.usage.model_dump())),
  )

- if cache:
+ if cache and cache_entry:
  cache_store(entry=cache_entry, output=output)

  return output
@@ -773,6 +781,10 @@ def get_model(
  if isinstance(model, Model):
  return model

+ # next see if this is the special "none" model
+ if model == "none":
+ model = "none/none"
+
  # now try finding an 'ambient' model (active or env var)
  if model is None:
  # return active_model if there is one
@@ -835,7 +847,7 @@ def get_model(
  config=config,
  **model_args,
  )
- m = Model(modelapi_instance, config)
+ m = Model(modelapi_instance, config, model_args)
  if memoize:
  _models[model_cache_key] = m
  return m
@@ -860,17 +872,25 @@ def cached_model(key: str) -> Model | None:


  def resolve_models(
- model: str | Model | list[str] | list[Model] | None,
+ model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
  model_base_url: str | None = None,
  model_args: dict[str, Any] = dict(),
  config: GenerateConfig = GenerateConfig(),
  ) -> list[Model]:
+ # resolve NotGiven to current INSPECT_EVAL_MODEL
+ if isinstance(model, NotGiven):
+ model = os.getenv("INSPECT_EVAL_MODEL", None)
+
+ # resolve None to NoModel
+ if model is None:
+ return [get_model("none")]
+
  # reflect back a plain model
  if isinstance(model, Model):
  return [model]

  # helper to resolve model of various types
- def resolve_model(m: str | Model | None) -> Model:
+ def resolve_model(m: str | Model) -> Model:
  return get_model(
  model=m,
  base_url=model_base_url,
@@ -878,11 +898,8 @@ def resolve_models(
  **model_args,
  )

- # resolve None and str to list
- if model is None or isinstance(model, str):
- model = model or os.getenv("INSPECT_EVAL_MODEL", None)
- if model is None:
- raise ValueError("No model specified (and no INSPECT_EVAL_MODEL defined)")
+ # str to list
+ if isinstance(model, str):
  model = [m.strip() for m in model.split(",")]

  # resolve models
@@ -1098,6 +1115,7 @@ def tool_result_images_reducer(
  content=edited_tool_message_content,
  tool_call_id=message.tool_call_id,
  function=message.function,
+ internal_name=message.internal_name,
  )
  ],
  pending_content + new_user_message_content,
@@ -1236,7 +1254,7 @@ def active_model() -> Model | None:


  # shared contexts for asyncio tasks
- active_model_context_var: ContextVar[Model] = ContextVar("active_model")
+ active_model_context_var: ContextVar[Model | None] = ContextVar("active_model")


  def handle_sample_message_limit(input: str | list[ChatMessage]) -> None:
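The `resolve_models` signature change distinguishes three cases: argument omitted (`NOT_GIVEN`, fall back to `INSPECT_EVAL_MODEL`), an explicit `None` (resolve to the new "none" model), and an explicit model. A standalone sketch of that sentinel pattern; the names below are local stand-ins, not the inspect_ai implementations:

    import os

    class NotGiven:
        def __repr__(self) -> str:
            return "NOT_GIVEN"

    NOT_GIVEN = NotGiven()

    def resolve_model(model: str | None | NotGiven = NOT_GIVEN) -> str:
        if isinstance(model, NotGiven):      # caller omitted the argument
            model = os.getenv("INSPECT_EVAL_MODEL")
        if model is None:                    # explicitly no model
            return "none/none"
        return model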
inspect_ai/model/_model_output.py CHANGED
@@ -188,8 +188,10 @@ class ModelOutput(BaseModel):
  model: str,
  tool_name: str,
  tool_arguments: dict[str, Any],
+ internal_tool_name: str | None = None,
  tool_call_id: str | None = None,
  content: str | None = None,
+ type: str = "function",
  ) -> "ModelOutput":
  """
  Returns a ModelOutput for requesting a tool call.
@@ -197,6 +199,8 @@ class ModelOutput(BaseModel):
  Args:
  model: model name
  tool_name: The name of the tool.
+ internal_tool_name: The model's internal name for the tool (if any).
+ type: The model's type for the tool. e.g. "function", "computer_use_preview"
  tool_arguments: The arguments passed to the tool.
  tool_call_id: Optional ID for the tool call. Defaults to a random UUID.
  content: Optional content to include in the message. Defaults to "tool call for tool {tool_name}".
@@ -221,8 +225,9 @@
  ToolCall(
  id=tool_call_id,
  function=tool_name,
+ internal_name=internal_tool_name,
  arguments=tool_arguments,
- type="function",
+ type=type,
  )
  ],
  ),
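For reference, a hypothetical call using the extended `for_tool_call` signature shown above (useful when scripting mock tool calls); the internal name and type values are illustrative, taken from the examples in the new docstring:

    from inspect_ai.model import ModelOutput

    output = ModelOutput.for_tool_call(
        model="mockllm/model",
        tool_name="computer",
        tool_arguments={"action": "screenshot"},
        internal_tool_name="computer_use_preview",  # provider-internal name (illustrative)
        type="computer_use_preview",                # provider tool type (illustrative)
    )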