inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. inspect_ai/_cli/eval.py +27 -0
  2. inspect_ai/_display/textual/widgets/samples.py +3 -3
  3. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  4. inspect_ai/_eval/eval.py +19 -2
  5. inspect_ai/_eval/evalset.py +4 -1
  6. inspect_ai/_eval/run.py +41 -0
  7. inspect_ai/_eval/task/generate.py +38 -44
  8. inspect_ai/_eval/task/log.py +26 -28
  9. inspect_ai/_eval/task/run.py +23 -27
  10. inspect_ai/_util/answer.py +26 -0
  11. inspect_ai/_util/constants.py +0 -1
  12. inspect_ai/_util/local_server.py +398 -0
  13. inspect_ai/_util/working.py +10 -4
  14. inspect_ai/_view/www/dist/assets/index.css +173 -159
  15. inspect_ai/_view/www/dist/assets/index.js +1417 -1142
  16. inspect_ai/_view/www/log-schema.json +379 -3
  17. inspect_ai/_view/www/package.json +1 -1
  18. inspect_ai/_view/www/src/@types/log.d.ts +93 -14
  19. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
  20. inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
  21. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
  22. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
  23. inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
  24. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
  25. inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
  26. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
  27. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
  28. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
  29. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
  30. inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
  31. inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
  32. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
  33. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  34. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  35. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  36. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  37. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  38. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  39. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  40. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  41. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  42. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  43. inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
  44. inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
  45. inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
  46. inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
  47. inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
  48. inspect_ai/_view/www/src/components/Card.css +0 -1
  49. inspect_ai/_view/www/src/constants.ts +2 -0
  50. inspect_ai/_view/www/src/utils/numeric.ts +17 -0
  51. inspect_ai/agent/_agent.py +3 -3
  52. inspect_ai/agent/_as_solver.py +22 -12
  53. inspect_ai/agent/_as_tool.py +20 -6
  54. inspect_ai/agent/_handoff.py +12 -1
  55. inspect_ai/agent/_react.py +4 -3
  56. inspect_ai/agent/_run.py +16 -3
  57. inspect_ai/agent/_types.py +9 -0
  58. inspect_ai/dataset/_dataset.py +6 -3
  59. inspect_ai/log/__init__.py +14 -0
  60. inspect_ai/log/_convert.py +4 -9
  61. inspect_ai/log/_file.py +56 -0
  62. inspect_ai/log/_log.py +99 -0
  63. inspect_ai/log/_recorders/__init__.py +2 -0
  64. inspect_ai/log/_recorders/buffer/database.py +12 -11
  65. inspect_ai/log/_recorders/buffer/filestore.py +2 -2
  66. inspect_ai/log/_recorders/buffer/types.py +2 -2
  67. inspect_ai/log/_recorders/eval.py +20 -65
  68. inspect_ai/log/_recorders/file.py +28 -6
  69. inspect_ai/log/_recorders/recorder.py +7 -0
  70. inspect_ai/log/_recorders/types.py +1 -23
  71. inspect_ai/log/_samples.py +14 -25
  72. inspect_ai/log/_transcript.py +84 -36
  73. inspect_ai/log/_tree.py +118 -0
  74. inspect_ai/log/_util.py +52 -0
  75. inspect_ai/model/__init__.py +5 -1
  76. inspect_ai/model/_call_tools.py +72 -44
  77. inspect_ai/model/_generate_config.py +14 -8
  78. inspect_ai/model/_model.py +66 -88
  79. inspect_ai/model/_model_output.py +25 -0
  80. inspect_ai/model/_openai.py +2 -0
  81. inspect_ai/model/_providers/anthropic.py +13 -23
  82. inspect_ai/model/_providers/hf.py +27 -1
  83. inspect_ai/model/_providers/openai_o1.py +8 -2
  84. inspect_ai/model/_providers/providers.py +18 -4
  85. inspect_ai/model/_providers/sglang.py +247 -0
  86. inspect_ai/model/_providers/vllm.py +211 -400
  87. inspect_ai/scorer/_choice.py +1 -2
  88. inspect_ai/solver/__init__.py +7 -2
  89. inspect_ai/solver/_basic_agent.py +3 -10
  90. inspect_ai/solver/_chain.py +1 -1
  91. inspect_ai/solver/_fork.py +1 -1
  92. inspect_ai/solver/_multiple_choice.py +5 -22
  93. inspect_ai/solver/_plan.py +2 -2
  94. inspect_ai/solver/_task_state.py +26 -88
  95. inspect_ai/solver/_transcript.py +6 -7
  96. inspect_ai/tool/_json_rpc_helpers.py +45 -17
  97. inspect_ai/tool/_mcp/_mcp.py +8 -5
  98. inspect_ai/tool/_mcp/_sandbox.py +8 -2
  99. inspect_ai/tool/_mcp/server.py +3 -1
  100. inspect_ai/tool/_tool_call.py +4 -1
  101. inspect_ai/tool/_tool_support_helpers.py +51 -12
  102. inspect_ai/tool/_tools/_bash_session.py +190 -68
  103. inspect_ai/tool/_tools/_computer/_computer.py +25 -1
  104. inspect_ai/tool/_tools/_execute.py +4 -1
  105. inspect_ai/tool/_tools/_text_editor.py +4 -3
  106. inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
  107. inspect_ai/util/__init__.py +16 -0
  108. inspect_ai/util/_anyio.py +11 -0
  109. inspect_ai/util/_collect.py +50 -0
  110. inspect_ai/util/_limit.py +393 -0
  111. inspect_ai/util/_limited_conversation.py +57 -0
  112. inspect_ai/util/_span.py +58 -0
  113. inspect_ai/util/_subtask.py +27 -42
  114. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  115. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
  116. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  117. inspect_ai/_display/core/group.py +0 -79
  118. inspect_ai/solver/_limit.py +0 -39
  119. inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
  120. inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
  121. inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
  122. inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
  123. inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
  124. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
  125. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
  126. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  127. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
  128. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
  129. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
  130. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
  131. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
  132. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
  133. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
  134. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
  135. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
  136. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
  137. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
  138. inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
  139. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
  140. inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
  141. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
  142. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
  143. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
  144. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  145. inspect_ai/tool/_tools/_computer/test_args.py +0 -151
  146. /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
  147. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  148. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  149. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -43,6 +43,9 @@ MAX_SANDBOXES_HELP = "Maximum number of sandboxes (per-provider) to run in paral
  NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task completes"
  FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count."
  NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
+ NO_LOG_REALTIME_HELP = (
+ "Do not log events in realtime (affects live viewing of samples in inspect view)"
+ )
  NO_FAIL_ON_ERROR_HELP = "Do not fail the eval if errors occur within samples (instead, continue running other samples)"
  RETRY_ON_ERROR_HELP = "Retry samples if they encounter errors (by default, no retries occur). Specify --retry-on-error to retry a single time, or specify e.g. `--retry-on-error=3` to retry multiple times."
  LOG_IMAGES_HELP = (
@@ -281,6 +284,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
  help=NO_LOG_SAMPLES_HELP,
  envvar="INSPECT_EVAL_NO_LOG_SAMPLES",
  )
+ @click.option(
+ "--no-log-realtime",
+ type=bool,
+ is_flag=True,
+ help=NO_LOG_REALTIME_HELP,
+ envvar="INSPECT_EVAL_NO_LOG_REALTIME",
+ )
  @click.option(
  "--log-images/--no-log-images",
  type=bool,
@@ -544,6 +554,7 @@ def eval_command(
  no_fail_on_error: bool | None,
  retry_on_error: int | None,
  no_log_samples: bool | None,
+ no_log_realtime: bool | None,
  log_images: bool | None,
  log_buffer: int | None,
  log_shared: int | None,
@@ -600,6 +611,7 @@ def eval_command(
  retry_on_error=retry_on_error,
  debug_errors=common["debug_errors"],
  no_log_samples=no_log_samples,
+ no_log_realtime=no_log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
@@ -718,6 +730,7 @@ def eval_set_command(
  no_fail_on_error: bool | None,
  retry_on_error: int | None,
  no_log_samples: bool | None,
+ no_log_realtime: bool | None,
  log_images: bool | None,
  log_buffer: int | None,
  log_shared: int | None,
@@ -779,6 +792,7 @@ def eval_set_command(
  retry_on_error=retry_on_error,
  debug_errors=common["debug_errors"],
  no_log_samples=no_log_samples,
+ no_log_realtime=no_log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
@@ -837,6 +851,7 @@ def eval_exec(
  retry_on_error: int | None,
  debug_errors: bool | None,
  no_log_samples: bool | None,
+ no_log_realtime: bool | None,
  log_images: bool | None,
  log_buffer: int | None,
  log_shared: int | None,
@@ -889,6 +904,7 @@ def eval_exec(
  # resolve negating options
  sandbox_cleanup = False if no_sandbox_cleanup else None
  log_samples = False if no_log_samples else None
+ log_realtime = False if no_log_realtime else None
  log_images = False if log_images is False else None
  trace = True if trace else None
  score = False if no_score else True
@@ -929,6 +945,7 @@ def eval_exec(
  max_subprocesses=max_subprocesses,
  max_sandboxes=max_sandboxes,
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
@@ -1069,6 +1086,13 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
  help=NO_LOG_SAMPLES_HELP,
  envvar="INSPECT_EVAL_LOG_SAMPLES",
  )
+ @click.option(
+ "--no-log-realtime",
+ type=bool,
+ is_flag=True,
+ help=NO_LOG_REALTIME_HELP,
+ envvar="INSPECT_EVAL_LOG_REALTIME",
+ )
  @click.option(
  "--log-images/--no-log-images",
  type=bool,
@@ -1136,6 +1160,7 @@ def eval_retry_command(
  no_fail_on_error: bool | None,
  retry_on_error: int | None,
  no_log_samples: bool | None,
+ no_log_realtime: bool | None,
  log_images: bool | None,
  log_buffer: int | None,
  log_shared: int | None,
@@ -1154,6 +1179,7 @@ def eval_retry_command(
  # resolve negating options
  sandbox_cleanup = False if no_sandbox_cleanup else None
  log_samples = False if no_log_samples else None
+ log_realtime = False if no_log_realtime else None
  log_images = False if log_images is False else None
  score = False if no_score else True
  score_display = False if no_score_display else None
@@ -1189,6 +1215,7 @@ def eval_retry_command(
  retry_on_error=retry_on_error,
  debug_errors=common["debug_errors"],
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
inspect_ai/_display/textual/widgets/samples.py CHANGED
@@ -591,10 +591,10 @@ class SampleToolbar(Horizontal):
  )
  if isinstance(last_event, ModelEvent):
  # see if there are retries in play
- if sample.retry_count > 0:
- suffix = "retry" if sample.retry_count == 1 else "retries"
+ if last_event.retries:
+ suffix = "retry" if last_event.retries == 1 else "retries"
  pending_caption_text = (
- f"Generating ({sample.retry_count:,} {suffix})..."
+ f"Generating ({last_event.retries:,} {suffix})..."
  )
  else:
  pending_caption_text = "Generating..."
inspect_ai/_display/textual/widgets/transcript.py CHANGED
@@ -30,7 +30,7 @@ from inspect_ai.log._transcript import (
  SampleInitEvent,
  SampleLimitEvent,
  ScoreEvent,
- StepEvent,
+ SpanBeginEvent,
  SubtaskEvent,
  ToolEvent,
  )
@@ -211,10 +211,6 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
  # render the call
  content = transcript_tool_call(event)

- # render sub-events
- if event.events:
- content.extend(render_sub_events(event.events))
-
  # render the output
  if isinstance(event.result, list):
  result: ToolResult = "\n".join(
@@ -235,23 +231,6 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
  return [EventDisplay("tool call", Group(*content))]


- def render_step_event(event: StepEvent) -> EventDisplay:
- if event.type == "solver":
- return render_solver_event(event)
- if event.type == "scorer":
- return render_scorer_event(event)
- else:
- return EventDisplay(step_title(event))
-
-
- def render_solver_event(event: StepEvent) -> EventDisplay:
- return EventDisplay(step_title(event))
-
-
- def render_scorer_event(event: StepEvent) -> EventDisplay:
- return EventDisplay(step_title(event))
-
-
  def render_score_event(event: ScoreEvent) -> EventDisplay:
  table = Table(box=None, show_header=False)
  table.add_column("", min_width=10, justify="left")
@@ -272,10 +251,6 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
  # render header
  content: list[RenderableType] = [transcript_function(event.name, event.input)]

- # render sub-events
- if event.events:
- content.extend(render_sub_events(event.events))
-
  if event.result:
  content.append(Text())
  if isinstance(event.result, str | int | float | bool | None):
@@ -345,8 +320,8 @@ def render_message(message: ChatMessage) -> list[RenderableType]:
  return content


- def step_title(event: StepEvent) -> str:
- return f"{event.type or 'step'}: {event.name}"
+ def span_title(event: SpanBeginEvent) -> str:
+ return f"{event.type or 'span'}: {event.name}"


  EventRenderer = Callable[[Any], EventDisplay | list[EventDisplay] | None]
@@ -354,7 +329,6 @@ EventRenderer = Callable[[Any], EventDisplay | list[EventDisplay] | None]
  _renderers: list[tuple[Type[Event], EventRenderer]] = [
  (SampleInitEvent, render_sample_init_event),
  (SampleLimitEvent, render_sample_limit_event),
- (StepEvent, render_step_event),
  (ModelEvent, render_model_event),
  (ToolEvent, render_tool_event),
  (SubtaskEvent, render_subtask_event),
inspect_ai/_eval/eval.py CHANGED
@@ -101,6 +101,7 @@ def eval(
  max_subprocesses: int | None = None,
  max_sandboxes: int | None = None,
  log_samples: bool | None = None,
+ log_realtime: bool | None = None,
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
@@ -145,7 +146,7 @@ def eval(
  to "eval", the native high-performance format).
  limit: Limit evaluated samples
  (defaults to all samples).
- sample_id: Evaluate specific sample(s) from the dataset.
+ sample_id: Evaluate specific sample(s) from the dataset. Use plain ids or preface with task names as required to disambiguate ids across tasks (e.g. `popularity:10`).
  epochs: Epochs to repeat samples for and optional score
  reducer function(s) used to combine sample scores (defaults to "mean")
  fail_on_error: `True` to fail on first sample error
@@ -171,6 +172,7 @@ def eval(
  max_sandboxes: Maximum number of sandboxes (per-provider)
  to run in parallel.
  log_samples: Log detailed samples and scores (defaults to True)
+ log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
  log_images: Log base64 encoded version of images,
  even if specified as a filename or URL (defaults to False)
  log_buffer: Number of samples to buffer before writing log file.
@@ -228,6 +230,7 @@ def eval(
  max_subprocesses=max_subprocesses,
  max_sandboxes=max_sandboxes,
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
@@ -281,6 +284,7 @@ async def eval_async(
  max_subprocesses: int | None = None,
  max_sandboxes: int | None = None,
  log_samples: bool | None = None,
+ log_realtime: bool | None = None,
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
@@ -314,7 +318,7 @@ async def eval_async(
  log_dir: Output path for logging results (defaults to file log in ./logs directory).
  log_format: Format for writing log files (defaults to "eval", the native high-performance format).
  limit: Limit evaluated samples (defaults to all samples).
- sample_id: Evaluate specific sample(s) from the dataset.
+ sample_id: Evaluate specific sample(s) from the dataset. Use plain ids or preface with task names as required to disambiguate ids across tasks (e.g. `popularity:10`).
  epochs: Epochs to repeat samples for and optional score
  reducer function(s) used to combine sample scores (defaults to "mean")
  fail_on_error: `True` to fail on first sample error
@@ -335,6 +339,7 @@ async def eval_async(
  max_subprocesses: Maximum number of subprocesses to run in parallel (default is os.cpu_count())
  max_sandboxes: Maximum number of sandboxes (per-provider) to run in parallel.
  log_samples: Log detailed samples and scores (defaults to True)
+ log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
  log_images: Log base64 encoded version of images, even if specified as a filename or URL (defaults to False)
  log_buffer: Number of samples to buffer before writing log file.
  If not specified, an appropriate default for the format and filesystem is
@@ -473,6 +478,7 @@ async def eval_async(
  max_sandboxes=max_sandboxes,
  sandbox_cleanup=sandbox_cleanup,
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
@@ -562,6 +568,7 @@ def eval_retry(
  retry_on_error: int | None = None,
  debug_errors: bool | None = None,
  log_samples: bool | None = None,
+ log_realtime: bool | None = None,
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
@@ -603,6 +610,7 @@ def eval_retry(
  debug_errors: Raise task errors (rather than logging them)
  so they can be debugged (defaults to False).
  log_samples: Log detailed samples and scores (defaults to True)
+ log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
  log_images: Log base64 encoded version of images,
  even if specified as a filename or URL (defaults to False)
  log_buffer: Number of samples to buffer before writing log file.
@@ -645,6 +653,7 @@ def eval_retry(
  retry_on_error=retry_on_error,
  debug_errors=debug_errors,
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
@@ -673,6 +682,7 @@ async def eval_retry_async(
  retry_on_error: int | None = None,
  debug_errors: bool | None = None,
  log_samples: bool | None = None,
+ log_realtime: bool | None = None,
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
@@ -707,6 +717,7 @@ async def eval_retry_async(
  debug_errors: Raise task errors (rather than logging them)
  so they can be debugged (defaults to False).
  log_samples: Log detailed samples and scores (defaults to True)
+ log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
  log_images: Log base64 encoded version of images,
  even if specified as a filename or URL (defaults to False)
  log_buffer: Number of samples to buffer before writing log file.
@@ -817,6 +828,11 @@ async def eval_retry_async(
  log_samples = (
  log_samples if log_samples is not None else eval_log.eval.config.log_samples
  )
+ log_realtime = (
+ log_realtime
+ if log_realtime is not None
+ else eval_log.eval.config.log_realtime
+ )
  log_images = (
  log_images if log_images is not None else eval_log.eval.config.log_images
  )
@@ -875,6 +891,7 @@ async def eval_retry_async(
  max_subprocesses=max_subprocesses,
  max_sandboxes=max_sandboxes,
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
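
Note: the `log_realtime` option added above is surfaced both as the `--no-log-realtime` CLI flag and as a parameter on `eval()`, `eval_set()`, and `eval_retry()`. A minimal sketch of disabling it from the Python API (the task path and model below are illustrative placeholders, not part of this diff):

    from inspect_ai import eval

    # skip realtime event buffering; samples are still written to the log file,
    # but live viewing of in-progress samples in `inspect view` is unavailable
    logs = eval(
        "examples/popularity.py",  # illustrative task
        model="openai/gpt-4o",     # illustrative model
        log_realtime=False,
    )
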
inspect_ai/_eval/evalset.py CHANGED
@@ -93,6 +93,7 @@ def eval_set(
  max_subprocesses: int | None = None,
  max_sandboxes: int | None = None,
  log_samples: bool | None = None,
+ log_realtime: bool | None = None,
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
@@ -147,7 +148,7 @@ def eval_set(
  log files (defaults to "eval", the native high-performance format).
  limit: Limit evaluated samples
  (defaults to all samples).
- sample_id: Evaluate specific sample(s) from the dataset.
+ sample_id: Evaluate specific sample(s) from the dataset. Use plain ids or preface with task names as required to disambiguate ids across tasks (e.g. `popularity:10`).
  epochs: Epochs to repeat samples for and optional score
  reducer function(s) used to combine sample scores (defaults to "mean")
  fail_on_error: `True` to fail on first sample error
@@ -173,6 +174,7 @@ def eval_set(
  max_sandboxes: Maximum number of sandboxes (per-provider)
  to run in parallel.
  log_samples: Log detailed samples and scores (defaults to True)
+ log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
  log_images: Log base64 encoded version of images,
  even if specified as a filename or URL (defaults to False)
  log_buffer: Number of samples to buffer before writing log file.
@@ -229,6 +231,7 @@ def eval_set(
  max_subprocesses=max_subprocesses,
  max_sandboxes=max_sandboxes,
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
inspect_ai/_eval/run.py CHANGED
@@ -122,6 +122,11 @@ async def eval_run(
  task = resolved_task.task
  task_eval_config = eval_config.model_copy()

+ # sample_ids can be specified per task
+ task_eval_config.sample_id = resolve_task_sample_ids(
+ resolved_task.task.name, task_eval_config.sample_id
+ )
+
  # resolve the task scorers
  eval_scorer_specs = (
  [as_scorer_spec(scorer) for scorer in task.scorer]
@@ -424,6 +429,42 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
  return results


+ def resolve_task_sample_ids(
+ task: str, sample_id: str | int | list[str] | list[int] | list[str | int] | None
+ ) -> str | int | list[str] | list[int] | list[str | int] | None:
+ def collect_for_task(sample: str | int) -> str | int | None:
+ if isinstance(sample, str):
+ scoped = sample.split(":", maxsplit=1)
+ if len(scoped) > 1:
+ if scoped[0].lower() == task.lower():
+ return scoped[1]
+ else:
+ return None
+ else:
+ return sample
+ else:
+ return sample
+
+ if sample_id is not None:
+ if isinstance(sample_id, list):
+ ids: list[int | str] = []
+ for id in sample_id:
+ collect = collect_for_task(id)
+ if collect is not None:
+ ids.append(collect)
+ return ids
+
+ else:
+ collect = collect_for_task(sample_id)
+ if collect is not None:
+ return collect
+ else:
+ return []
+
+ else:
+ return sample_id
+
+
  async def startup_sandbox_environments(
  eval_sandbox: SandboxEnvironmentSpec | None,
  tasks: list[ResolvedTask],
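
The new `resolve_task_sample_ids()` above implements the task-scoped `sample_id` syntax documented in the `eval()` and `eval_set()` docstrings: plain ids apply to every task, while `task:id` values apply only to the named task. A hedged sketch of the calling convention (task files and ids are illustrative, not from this diff):

    from inspect_ai import eval

    # "popularity:10" is routed only to the popularity task;
    # the bare id 1 is passed through to both tasks
    logs = eval(
        tasks=["popularity.py", "security_guide.py"],  # illustrative tasks
        sample_id=["popularity:10", 1],
    )
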
inspect_ai/_eval/task/generate.py CHANGED
@@ -4,7 +4,6 @@ from inspect_ai.model import CachePolicy, GenerateConfig, Model
  from inspect_ai.model._cache import epoch
  from inspect_ai.model._call_tools import execute_tools
  from inspect_ai.solver import TaskState
- from inspect_ai.solver._limit import SampleLimitExceededError
  from inspect_ai.tool import ToolFunction


@@ -18,53 +17,48 @@ async def task_generate(
  # track tool_choice (revert to "auto" after first forced call of a tool)
  tool_choice = state.tool_choice

- try:
- while True:
- # If we don't update the epoch here as we go, it's entirely possible
- # we'd cache the same response for every single epoch, which would
- # completely defeat the point!
- epoch.set(state.epoch)
+ while True:
+ # If we don't update the epoch here as we go, it's entirely possible
+ # we'd cache the same response for every single epoch, which would
+ # completely defeat the point!
+ epoch.set(state.epoch)

- # call the model
- state.output = await model.generate(
- input=state.messages,
- tools=state.tools,
- tool_choice=tool_choice,
- config=config,
- cache=cache,
- )
-
- # append the assistant message
- message = state.output.message
- state.messages.append(message)
-
- # check for completed
- if state.completed:
- return state
+ # call the model
+ state.output = await model.generate(
+ input=state.messages,
+ tools=state.tools,
+ tool_choice=tool_choice,
+ config=config,
+ cache=cache,
+ )

- # resolve tool calls if necessary
- if tool_calls != "none" and message.tool_calls:
- # call tools and update messages and output
- messages, output = await execute_tools(
- state.messages, state.tools, config.max_tool_output
- )
- state.messages.extend(messages)
- if output is not None:
- state.output = output
+ # append the assistant message
+ message = state.output.message
+ state.messages.append(message)

- # check for completed or only executing a single tool call
- if state.completed or tool_calls == "single":
- return state
+ # check for completed
+ if state.completed:
+ return state

- # if a tool_call was forced set tool_choice to 'auto'
- # (otherwise it will get forced over and over again)
- if isinstance(tool_choice, ToolFunction):
- tool_choice = "auto"
+ # resolve tool calls if necessary
+ if tool_calls != "none" and message.tool_calls:
+ # call tools and update messages and output
+ messages, output = await execute_tools(
+ state.messages, state.tools, config.max_tool_output
+ )
+ state.messages.extend(messages)
+ if output is not None:
+ state.output = output

- # no tool calls or not resolving tool calls, we are done!
- else:
+ # check for completed or only executing a single tool call
+ if state.completed or tool_calls == "single":
  return state

- # propagate current state along with sample limit exceeded
- except SampleLimitExceededError as ex:
- raise ex.with_state(state)
+ # if a tool_call was forced set tool_choice to 'auto'
+ # (otherwise it will get forced over and over again)
+ if isinstance(tool_choice, ToolFunction):
+ tool_choice = "auto"
+
+ # no tool calls or not resolving tool calls, we are done!
+ else:
+ return state
inspect_ai/_eval/task/log.py CHANGED
@@ -30,13 +30,14 @@ from inspect_ai.log._log import (
  EvalLog,
  EvalMetricDefinition,
  EvalSampleReductions,
+ EvalSampleSummary,
  EvalScorer,
  eval_config_defaults,
  )
  from inspect_ai.log._model import model_args_for_log, model_roles_to_model_roles_config
  from inspect_ai.log._recorders import Recorder
  from inspect_ai.log._recorders.buffer import SampleBufferDatabase
- from inspect_ai.log._recorders.types import SampleEvent, SampleSummary
+ from inspect_ai.log._recorders.types import SampleEvent
  from inspect_ai.log._transcript import Event
  from inspect_ai.model import (
  GenerateConfig,
@@ -160,13 +161,17 @@ class TaskLogger:
  self.flush_buffer = eval_config.log_buffer or recorder.default_log_buffer()
  self.flush_pending: list[tuple[str | int, int]] = []

+ # sample buffer db
+ self._buffer_db: SampleBufferDatabase | None = None
+
  async def init(self) -> None:
  self._location = await self.recorder.log_init(self.eval)
- self._buffer_db = SampleBufferDatabase(
- location=self._location,
- log_images=self.eval.config.log_images is not False,
- log_shared=self.eval.config.log_shared,
- )
+ if self.eval.config.log_realtime is not False:
+ self._buffer_db = SampleBufferDatabase(
+ location=self._location,
+ log_images=self.eval.config.log_images is not False,
+ log_shared=self.eval.config.log_shared,
+ )

  @property
  def location(self) -> str:
@@ -180,36 +185,26 @@
  await self.recorder.log_start(self.eval, plan)
  await self.recorder.flush(self.eval)

- async def start_sample(self, sample: SampleSummary) -> None:
- self._buffer_db.start_sample(sample)
+ async def start_sample(self, sample: EvalSampleSummary) -> None:
+ if self._buffer_db is not None:
+ self._buffer_db.start_sample(sample)

  def log_sample_event(self, id: str | int, epoch: int, event: Event) -> None:
  # log the sample event
- self._buffer_db.log_events([SampleEvent(id=id, epoch=epoch, event=event)])
+ if self._buffer_db is not None:
+ self._buffer_db.log_events([SampleEvent(id=id, epoch=epoch, event=event)])

  def remove_sample(self, id: str | int, epoch: int) -> None:
- self._buffer_db.remove_samples([(id, epoch)])
+ if self._buffer_db is not None:
+ self._buffer_db.remove_samples([(id, epoch)])

  async def complete_sample(self, sample: EvalSample, *, flush: bool) -> None:
  # log the sample
  await self.recorder.log_sample(self.eval, sample)

  # mark complete
- self._buffer_db.complete_sample(
- SampleSummary(
- id=sample.id,
- epoch=sample.epoch,
- input=sample.input,
- target=sample.target,
- completed=True,
- scores=sample.scores,
- error=sample.error.message if sample.error is not None else None,
- limit=f"{sample.limit.type}" if sample.limit is not None else None,
- retries=len(sample.error_retries)
- if sample.error_retries is not None
- else None,
- )
- )
+ if self._buffer_db is not None:
+ self._buffer_db.complete_sample(sample.summary())

  # flush if requested
  if flush:
@@ -219,7 +214,8 @@
  await self.recorder.flush(self.eval)

  # notify the event db it can remove these
- self._buffer_db.remove_samples(self.flush_pending)
+ if self._buffer_db is not None:
+ self._buffer_db.remove_samples(self.flush_pending)

  # Clear
  self.flush_pending.clear()
@@ -229,7 +225,8 @@
  self._samples_completed += 1

  def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
- self._buffer_db.update_metrics(metrics)
+ if self._buffer_db is not None:
+ self._buffer_db.update_metrics(metrics)

  async def log_finish(
  self,
@@ -245,7 +242,8 @@
  )

  # cleanup the events db
- self._buffer_db.cleanup()
+ if self._buffer_db is not None:
+ self._buffer_db.cleanup()

  # return log
  return log