inspect_ai-0.3.80-py3-none-any.whl → inspect_ai-0.3.82-py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
Files changed (179)
  1. inspect_ai/_cli/eval.py +35 -2
  2. inspect_ai/_cli/util.py +44 -1
  3. inspect_ai/_display/core/config.py +1 -1
  4. inspect_ai/_display/core/display.py +13 -4
  5. inspect_ai/_display/core/results.py +1 -1
  6. inspect_ai/_display/textual/widgets/task_detail.py +5 -4
  7. inspect_ai/_eval/eval.py +38 -1
  8. inspect_ai/_eval/evalset.py +5 -0
  9. inspect_ai/_eval/run.py +5 -2
  10. inspect_ai/_eval/task/log.py +53 -6
  11. inspect_ai/_eval/task/run.py +51 -10
  12. inspect_ai/_util/constants.py +2 -0
  13. inspect_ai/_util/file.py +17 -1
  14. inspect_ai/_util/json.py +36 -1
  15. inspect_ai/_view/server.py +113 -1
  16. inspect_ai/_view/www/App.css +1 -1
  17. inspect_ai/_view/www/dist/assets/index.css +518 -296
  18. inspect_ai/_view/www/dist/assets/index.js +38803 -36307
  19. inspect_ai/_view/www/eslint.config.mjs +1 -1
  20. inspect_ai/_view/www/log-schema.json +13 -0
  21. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
  22. inspect_ai/_view/www/package.json +8 -2
  23. inspect_ai/_view/www/src/App.tsx +151 -855
  24. inspect_ai/_view/www/src/api/api-browser.ts +176 -5
  25. inspect_ai/_view/www/src/api/api-vscode.ts +75 -1
  26. inspect_ai/_view/www/src/api/client-api.ts +66 -10
  27. inspect_ai/_view/www/src/api/jsonrpc.ts +2 -0
  28. inspect_ai/_view/www/src/api/types.ts +107 -2
  29. inspect_ai/_view/www/src/appearance/icons.ts +1 -0
  30. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +3 -3
  31. inspect_ai/_view/www/src/components/DownloadPanel.tsx +2 -2
  32. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +56 -61
  33. inspect_ai/_view/www/src/components/FindBand.tsx +17 -9
  34. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +1 -1
  35. inspect_ai/_view/www/src/components/JsonPanel.tsx +14 -24
  36. inspect_ai/_view/www/src/components/LargeModal.tsx +2 -35
  37. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +27 -11
  38. inspect_ai/_view/www/src/components/LiveVirtualList.module.css +11 -0
  39. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +177 -0
  40. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +3 -3
  41. inspect_ai/_view/www/src/components/MessageBand.tsx +14 -9
  42. inspect_ai/_view/www/src/components/MorePopOver.tsx +3 -3
  43. inspect_ai/_view/www/src/components/NavPills.tsx +20 -8
  44. inspect_ai/_view/www/src/components/NoContentsPanel.module.css +12 -0
  45. inspect_ai/_view/www/src/components/NoContentsPanel.tsx +20 -0
  46. inspect_ai/_view/www/src/components/ProgressBar.module.css +5 -4
  47. inspect_ai/_view/www/src/components/ProgressBar.tsx +3 -2
  48. inspect_ai/_view/www/src/components/PulsingDots.module.css +81 -0
  49. inspect_ai/_view/www/src/components/PulsingDots.tsx +45 -0
  50. inspect_ai/_view/www/src/components/TabSet.tsx +4 -37
  51. inspect_ai/_view/www/src/components/ToolButton.tsx +3 -4
  52. inspect_ai/_view/www/src/index.tsx +26 -94
  53. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +9 -1
  54. inspect_ai/_view/www/src/logfile/remoteZipFile.ts +30 -4
  55. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +4 -6
  56. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +1 -1
  57. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +9 -1
  58. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +67 -28
  59. inspect_ai/_view/www/src/samples/SampleDialog.tsx +51 -22
  60. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +4 -0
  61. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +144 -90
  62. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
  63. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +82 -35
  64. inspect_ai/_view/www/src/samples/SamplesTools.tsx +23 -30
  65. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -1
  66. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +1 -1
  67. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +45 -53
  68. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +4 -1
  69. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +3 -0
  70. inspect_ai/_view/www/src/samples/chat/messages.ts +34 -0
  71. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +3 -0
  72. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +10 -1
  73. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +22 -46
  74. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +25 -17
  75. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +2 -1
  76. inspect_ai/_view/www/src/samples/descriptor/types.ts +6 -5
  77. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +21 -3
  78. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +20 -1
  79. inspect_ai/_view/www/src/samples/list/SampleList.tsx +105 -85
  80. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +6 -0
  81. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +27 -14
  82. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +29 -18
  83. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +28 -28
  84. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +19 -9
  85. inspect_ai/_view/www/src/samples/sampleDataAdapter.ts +33 -0
  86. inspect_ai/_view/www/src/samples/sampleLimit.ts +2 -2
  87. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +7 -9
  88. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +7 -11
  89. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +0 -13
  90. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +0 -13
  91. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +0 -13
  92. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +4 -0
  93. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +10 -24
  94. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +0 -13
  95. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -22
  96. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +15 -24
  97. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +0 -13
  98. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +6 -28
  99. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +24 -34
  100. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +4 -0
  101. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +8 -13
  102. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +197 -338
  103. inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.module.css +16 -0
  104. inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.tsx +44 -0
  105. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +7 -4
  106. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +52 -58
  107. inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.module.css +23 -0
  108. inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.tsx +27 -0
  109. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +30 -1
  110. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +102 -72
  111. inspect_ai/_view/www/src/scoring/utils.ts +87 -0
  112. inspect_ai/_view/www/src/state/appSlice.ts +244 -0
  113. inspect_ai/_view/www/src/state/hooks.ts +397 -0
  114. inspect_ai/_view/www/src/state/logPolling.ts +196 -0
  115. inspect_ai/_view/www/src/state/logSlice.ts +214 -0
  116. inspect_ai/_view/www/src/state/logsPolling.ts +118 -0
  117. inspect_ai/_view/www/src/state/logsSlice.ts +181 -0
  118. inspect_ai/_view/www/src/state/samplePolling.ts +311 -0
  119. inspect_ai/_view/www/src/state/sampleSlice.ts +127 -0
  120. inspect_ai/_view/www/src/state/sampleUtils.ts +21 -0
  121. inspect_ai/_view/www/src/state/scrolling.ts +206 -0
  122. inspect_ai/_view/www/src/state/store.ts +168 -0
  123. inspect_ai/_view/www/src/state/store_filter.ts +84 -0
  124. inspect_ai/_view/www/src/state/utils.ts +23 -0
  125. inspect_ai/_view/www/src/storage/index.ts +26 -0
  126. inspect_ai/_view/www/src/types/log.d.ts +2 -0
  127. inspect_ai/_view/www/src/types.ts +94 -32
  128. inspect_ai/_view/www/src/utils/attachments.ts +58 -23
  129. inspect_ai/_view/www/src/utils/logger.ts +52 -0
  130. inspect_ai/_view/www/src/utils/polling.ts +100 -0
  131. inspect_ai/_view/www/src/utils/react.ts +30 -0
  132. inspect_ai/_view/www/src/utils/vscode.ts +1 -1
  133. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +181 -216
  134. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +11 -53
  135. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +8 -18
  136. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -0
  137. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +40 -22
  138. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +0 -1
  139. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +98 -39
  140. inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.module.css +32 -0
  141. inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.tsx +32 -0
  142. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +11 -13
  143. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +6 -2
  144. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +4 -4
  145. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +28 -13
  146. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +5 -10
  147. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +4 -4
  148. inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.module.css +22 -0
  149. inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.tsx +19 -0
  150. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +110 -115
  151. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +37 -5
  152. inspect_ai/_view/www/src/workspace/tabs/types.ts +4 -0
  153. inspect_ai/_view/www/src/workspace/types.ts +4 -3
  154. inspect_ai/_view/www/src/workspace/utils.ts +4 -4
  155. inspect_ai/_view/www/vite.config.js +6 -0
  156. inspect_ai/_view/www/yarn.lock +370 -354
  157. inspect_ai/log/_condense.py +26 -0
  158. inspect_ai/log/_log.py +6 -3
  159. inspect_ai/log/_recorders/buffer/__init__.py +14 -0
  160. inspect_ai/log/_recorders/buffer/buffer.py +30 -0
  161. inspect_ai/log/_recorders/buffer/database.py +685 -0
  162. inspect_ai/log/_recorders/buffer/filestore.py +259 -0
  163. inspect_ai/log/_recorders/buffer/types.py +84 -0
  164. inspect_ai/log/_recorders/eval.py +2 -11
  165. inspect_ai/log/_recorders/types.py +30 -0
  166. inspect_ai/log/_transcript.py +27 -1
  167. inspect_ai/model/_call_tools.py +1 -0
  168. inspect_ai/model/_generate_config.py +2 -2
  169. inspect_ai/model/_model.py +1 -0
  170. inspect_ai/tool/_tool_support_helpers.py +4 -4
  171. inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -1
  172. inspect_ai/util/_subtask.py +1 -0
  173. {inspect_ai-0.3.80.dist-info → inspect_ai-0.3.82.dist-info}/METADATA +2 -2
  174. {inspect_ai-0.3.80.dist-info → inspect_ai-0.3.82.dist-info}/RECORD +178 -138
  175. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +0 -22
  176. {inspect_ai-0.3.80.dist-info → inspect_ai-0.3.82.dist-info}/WHEEL +0 -0
  177. {inspect_ai-0.3.80.dist-info → inspect_ai-0.3.82.dist-info}/entry_points.txt +0 -0
  178. {inspect_ai-0.3.80.dist-info → inspect_ai-0.3.82.dist-info}/licenses/LICENSE +0 -0
  179. {inspect_ai-0.3.80.dist-info → inspect_ai-0.3.82.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -10,6 +10,7 @@ from inspect_ai._util.constants import (
     ALL_LOG_LEVELS,
     DEFAULT_EPOCHS,
     DEFAULT_LOG_LEVEL_TRANSCRIPT,
+    DEFAULT_LOG_SHARED,
     DEFAULT_MAX_CONNECTIONS,
 )
 from inspect_ai._util.file import filesystem
@@ -25,7 +26,12 @@ from .common import (
     common_options,
     process_common_options,
 )
-from .util import parse_cli_args, parse_cli_config, parse_sandbox
+from .util import (
+    int_or_bool_flag_callback,
+    parse_cli_args,
+    parse_cli_config,
+    parse_sandbox,
+)
 
 MAX_SAMPLES_HELP = "Maximum number of samples to run in parallel (default is running all samples in parallel)"
 MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
@@ -41,6 +47,7 @@ LOG_IMAGES_HELP = (
     "Include base64 encoded versions of filename or URL based images in the log file."
 )
 LOG_BUFFER_HELP = "Number of samples to buffer before writing log file. If not specified, an appropriate default for the format and filesystem is chosen (10 for most all cases, 100 for JSON logs on remote filesystems)."
+LOG_SHARED_HELP = "Sync sample events to log directory so that users on other systems can see log updates in realtime (defaults to no syncing). If enabled will sync every 10 seconds (or pass a value to sync every `n` seconds)."
 NO_SCORE_HELP = (
     "Do not score model output (use the inspect score command to score output later)"
 )
@@ -266,6 +273,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--log-buffer", type=int, help=LOG_BUFFER_HELP, envvar="INSPECT_EVAL_LOG_BUFFER"
     )
+    @click.option(
+        "--log-shared",
+        is_flag=False,
+        flag_value="true",
+        default=None,
+        callback=int_or_bool_flag_callback(DEFAULT_LOG_SHARED),
+        help=LOG_SHARED_HELP,
+        envvar=["INSPECT_LOG_SHARED", "INSPECT_EVAL_LOG_SHARED"],
+    )
     @click.option(
         "--no-score",
         type=bool,
@@ -396,7 +412,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--reasoning-effort",
         type=click.Choice(["low", "medium", "high"]),
-        help="Constrains effort on reasoning for reasoning models. Open AI o-series models only.",
+        help="Constrains effort on reasoning for reasoning models (defaults to `medium`). Open AI o-series models only.",
         envvar="INSPECT_EVAL_REASONING_EFFORT",
     )
     @click.option(
@@ -503,6 +519,7 @@ def eval_command(
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
+    log_shared: int | None,
     no_score: bool | None,
     no_score_display: bool | None,
     log_format: Literal["eval", "json"] | None,
@@ -556,6 +573,7 @@
         no_log_samples=no_log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         no_score=no_score,
         no_score_display=no_score_display,
         is_eval_set=False,
@@ -670,6 +688,7 @@ def eval_set_command(
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
+    log_shared: int | None,
     no_score: bool | None,
     no_score_display: bool | None,
     bundle_dir: str | None,
@@ -728,6 +747,7 @@
         no_log_samples=no_log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         no_score=no_score,
         no_score_display=no_score_display,
         is_eval_set=True,
@@ -783,6 +803,7 @@ def eval_exec(
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
+    log_shared: int | None,
     no_score: bool | None,
     no_score_display: bool | None,
     is_eval_set: bool = False,
@@ -865,6 +886,7 @@
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
     )
@@ -1004,6 +1026,15 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
 @click.option(
     "--log-buffer", type=int, help=LOG_BUFFER_HELP, envvar="INSPECT_EVAL_LOG_BUFFER"
 )
+@click.option(
+    "--log-shared",
+    is_flag=False,
+    flag_value="true",
+    default=None,
+    callback=int_or_bool_flag_callback(DEFAULT_LOG_SHARED),
+    help=LOG_SHARED_HELP,
+    envvar=["INSPECT_LOG_SHARED", "INSPECT_EVAL_LOG_SHARED"],
+)
 @click.option(
     "--no-score",
     type=bool,
@@ -1052,6 +1083,7 @@ def eval_retry_command(
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
+    log_shared: int | None,
     no_score: bool | None,
     no_score_display: bool | None,
     max_connections: int | None,
@@ -1099,6 +1131,7 @@
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         max_retries=max_retries,
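
Taken together, the option declaration and LOG_SHARED_HELP above imply CLI usage along these lines (a sketch based on the stated defaults; `ctf.py` stands in for any task file):

    inspect eval ctf.py                  # no syncing (default)
    inspect eval ctf.py --log-shared     # sync sample events every 10 seconds
    inspect eval ctf.py --log-shared 30  # sync sample events every 30 seconds

Per the envvar list above, the same setting can also come from INSPECT_LOG_SHARED or INSPECT_EVAL_LOG_SHARED.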
inspect_ai/_cli/util.py CHANGED
@@ -1,11 +1,54 @@
-from typing import Any
+from typing import Any, Callable
 
+import click
 import yaml
 
 from inspect_ai._util.config import resolve_args
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 
 
+def int_or_bool_flag_callback(
+    true_value: int, false_value: int = 0
+) -> Callable[[click.Context, click.Parameter, Any], int]:
+    def callback(ctx: click.Context, param: click.Parameter, value: Any) -> int:
+        """Parse an option that can be either a boolean flag or an integer.
+
+        Desired behavior:
+        - Not specified at all -> false_value
+        - Specified with no value -> true_value
+        - Specified with "true"/"false" -> true_value or false_value respectively
+        - Specified with an integer -> that integer
+        """
+        # 1. If this parameter was never given on the command line,
+        #    then we return false_value.
+        source = ctx.get_parameter_source(param.name) if param.name else ""
+        if source == click.core.ParameterSource.DEFAULT:
+            # means the user did NOT specify the flag at all
+            return false_value
+
+        # 2. The user did specify the flag. If value is None, that means
+        #    they used the flag with no argument, e.g. --my-flag
+        if value is None:
+            return true_value
+
+        # 3. If there is a value, try to parse booleans or an integer.
+        lower_val = value.lower()
+        if lower_val in ("true", "yes", "1"):
+            return true_value
+        elif lower_val in ("false", "no", "0"):
+            return false_value
+        else:
+            # 4. Otherwise, assume it is an integer
+            try:
+                return int(value)
+            except ValueError:
+                raise click.BadParameter(
+                    f"Expected 'true', 'false', or an integer for --{param.name}. Got: {value}"
+                )
+
+    return callback
+
+
 def parse_cli_config(
     args: tuple[str] | list[str] | None, config: str | None
 ) -> dict[str, Any]:
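
For context, a minimal sketch of how this callback composes with a Click option (the real wiring for --log-shared appears in eval.py above; `--interval` and `demo` are hypothetical names):

    import click

    from inspect_ai._cli.util import int_or_bool_flag_callback

    @click.command()
    @click.option(
        "--interval",
        is_flag=False,      # the option accepts an (optional) value
        flag_value="true",  # value supplied when used as a bare flag
        default=None,
        callback=int_or_bool_flag_callback(10),  # bare flag -> 10, absent -> 0
    )
    def demo(interval: int) -> None:
        click.echo(f"interval={interval}")

    # $ demo                -> interval=0
    # $ demo --interval     -> interval=10
    # $ demo --interval 30  -> interval=30

Note one quirk of the parsing: because "1" and "0" are treated as booleans, `--interval 1` yields the true_value (10 here) rather than the integer 1.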
inspect_ai/_display/core/config.py CHANGED
@@ -36,7 +36,7 @@ def task_config(
             value = value if isinstance(value, list) else [value]
             value = [str(v) for v in value]
             config_print.append(f"{name}: {','.join(value)}")
-        elif name not in ["limit", "model", "response_schema"]:
+        elif name not in ["limit", "model", "response_schema", "log_shared"]:
             if isinstance(value, list):
                 value = ",".join([str(v) for v in value])
             if isinstance(value, str):
inspect_ai/_display/core/display.py CHANGED
@@ -15,6 +15,7 @@ from typing import (
 )
 
 import rich
+from pydantic import BaseModel, Field, field_validator
 from rich.console import Console
 
 from inspect_ai.log import EvalConfig, EvalResults, EvalStats
@@ -104,12 +105,20 @@ class TaskScreen(contextlib.AbstractContextManager["TaskScreen"]):
         raise NotImplementedError("input_panel not implemented by current display")
 
 
-@dataclass
-class TaskDisplayMetric:
+class TaskDisplayMetric(BaseModel):
     scorer: str
     name: str
-    value: float | int
-    reducer: str | None
+    value: float | int | None = Field(default=None)
+    reducer: str | None = Field(default=None)
+
+    @field_validator("value", mode="before")
+    @classmethod
+    def handle_null_value(cls, v: Any) -> Union[float, int, None]:
+        if v is None:
+            return None
+        if isinstance(v, float | int):
+            return v
+        raise ValueError(f"Expected float, int, or None, got {type(v)}")
 
 
 @runtime_checkable
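
A quick sketch of what the reworked model accepts (pydantic surfaces the validator's ValueError as a ValidationError):

    from inspect_ai._display.core.display import TaskDisplayMetric

    TaskDisplayMetric(scorer="match", name="accuracy", value=0.85)  # ok
    TaskDisplayMetric(scorer="match", name="accuracy")              # ok, value=None
    TaskDisplayMetric(scorer="match", name="accuracy", value="xy")  # ValidationError

Allowing value=None matters for realtime display, where metrics may not have been computed yet; the results.py change below renders such metrics as "n/a".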
inspect_ai/_display/core/results.py CHANGED
@@ -180,7 +180,7 @@ def task_metric(metrics: list[TaskDisplayMetric], width: int | None = None) -> str:
     )
 
     metric = metrics[0]
-    if np.isnan(metric.value):
+    if metric.value is None or np.isnan(metric.value):
         value = " n/a"
     else:
         value = f"{metric.value:.2f}"
inspect_ai/_display/textual/widgets/task_detail.py CHANGED
@@ -14,7 +14,7 @@ from inspect_ai._display.core.display import TaskDisplayMetric
 @dataclass
 class TaskMetric:
     name: str
-    value: float
+    value: float | int | None
 
 
 class TaskDetail(Widget):
@@ -233,9 +233,10 @@ class TaskMetrics(Widget):
         for metric in self.metrics:
             # Add the value static but keep it around
             # for future updates
-            self.value_widgets[metric.name] = Static(
-                self._metric_value(metric.value), markup=False
-            )
+            if metric.value is not None:
+                self.value_widgets[metric.name] = Static(
+                    self._metric_value(metric.value), markup=False
+                )
 
             grid.mount(Static(metric.name, markup=False))
             grid.mount(self.value_widgets[metric.name])
inspect_ai/_eval/eval.py CHANGED
@@ -15,7 +15,11 @@ from typing_extensions import Unpack
 from inspect_ai._cli.util import parse_cli_args
 from inspect_ai._display.core.active import display as task_display
 from inspect_ai._util.config import resolve_args
-from inspect_ai._util.constants import DEFAULT_LOG_FORMAT
+from inspect_ai._util.constants import (
+    DEFAULT_LOG_FORMAT,
+    DEFAULT_LOG_SHARED,
+    JSON_LOG_FORMAT,
+)
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.file import absolute_file_path
 from inspect_ai._util.logger import warn_once
@@ -31,6 +35,7 @@ from inspect_ai.approval._policy import (
 from inspect_ai.log import EvalConfig, EvalLog, EvalLogInfo
 from inspect_ai.log._file import read_eval_log_async
 from inspect_ai.log._recorders import create_recorder_for_format
+from inspect_ai.log._recorders.buffer import cleanup_sample_buffers
 from inspect_ai.model import (
     GenerateConfig,
     GenerateConfigArgs,
@@ -92,6 +97,7 @@ def eval(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -161,6 +167,9 @@
         log_buffer: Number of samples to buffer before writing log file.
             If not specified, an appropriate default for the format and filesystem is
             chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+            can see log updates in realtime (defaults to no syncing). Specify `True`
+            to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -210,6 +219,7 @@
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         **kwargs,
@@ -260,6 +270,7 @@ async def eval_async(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -312,6 +323,7 @@
         log_buffer: Number of samples to buffer before writing log file.
             If not specified, an appropriate default for the format and filesystem is
             chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Indicate that the log directory is shared, which results in additional syncing of realtime log data for Inspect View.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -390,6 +402,15 @@
             f"ERROR: You do not have write permission for the log_dir '{log_dir}'"
         )
 
+    # resolve log_shared
+    log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared
+
+    # validate that --log-shared can't be used with 'json' format
+    if log_shared and log_format == JSON_LOG_FORMAT:
+        raise PrerequisiteError(
+            "ERROR: --log-shared is not compatible with the json log format."
+        )
+
     # resolve solver
     solver = chain(solver) if isinstance(solver, list) else solver
 
@@ -426,6 +447,7 @@
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score_display=score_display,
     )
 
@@ -485,6 +507,9 @@
         )
         logs = EvalLogs(results)
 
+        # cleanup sample buffers if required
+        cleanup_sample_buffers(log_dir)
+
     finally:
         _eval_async_running = False
 
@@ -510,6 +535,7 @@ def eval_retry(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     max_retries: int | None = None,
@@ -551,6 +577,9 @@
         log_buffer: Number of samples to buffer before writing log file.
             If not specified, an appropriate default for the format and filesystem is
             chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+            can see log updates in realtime (defaults to no syncing). Specify `True`
+            to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         max_retries:
@@ -586,6 +615,7 @@
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         max_retries=max_retries,
@@ -612,6 +642,7 @@ async def eval_retry_async(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     max_retries: int | None = None,
@@ -651,6 +682,8 @@
         log_buffer: (int | None): Number of samples to buffer before writing log file.
             If not specified, an appropriate default for the format and filesystem is
             chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Indicate that the log directory is shared, which results in
+            additional syncing of realtime log data for Inspect View.
         score (bool): Score output (defaults to True)
         score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         max_retries (int | None):
@@ -750,6 +783,9 @@
     log_buffer = (
         log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
     )
+    log_shared = (
+        log_shared if log_shared is not None else eval_log.eval.config.log_shared
+    )
     score_display = (
         score_display
         if score_display is not None
@@ -796,6 +832,7 @@
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         **dict(config),
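
The same capability is exposed through the Python API (a sketch; "mytask" is a hypothetical task):

    from inspect_ai import eval

    eval("mytask", log_shared=True)  # True resolves to DEFAULT_LOG_SHARED (10 seconds)
    eval("mytask", log_shared=30)    # sync every 30 seconds

Per the validation added to eval_async above, combining log_shared with the json log format raises a PrerequisiteError.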
inspect_ai/_eval/evalset.py CHANGED
@@ -92,6 +92,7 @@ def eval_set(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     bundle_dir: str | None = None,
     bundle_overwrite: bool = False,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -171,6 +172,9 @@
         log_buffer: Number of samples to buffer before writing log file.
             If not specified, an appropriate default for the format and filesystem is
             chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+            can see log updates in realtime (defaults to no syncing). Specify `True`
+            to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         bundle_dir: If specified, the log viewer and logs generated
             by this eval set will be bundled into this directory.
         bundle_overwrite: Whether to overwrite files in the bundle_dir.
@@ -219,6 +223,7 @@
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         **kwargs,
     )
inspect_ai/_eval/run.py CHANGED
@@ -407,12 +407,15 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalLog]:
     # Use anyio task group instead of manual task management
     try:
         async with anyio.create_task_group() as tg:
+            # compute number of workers (never more than total_tasks)
+            num_workers = min(parallel, total_tasks)
+
             # start worker tasks
-            for _ in range(parallel):
+            for _ in range(num_workers):
                 tg.start_soon(worker)
 
             # enqueue initial set of tasks
-            for _ in range(min(parallel, total_tasks)):
+            for _ in range(num_workers):
                 await enque_next_task()
     except anyio.get_cancelled_exc_class():
         pass
inspect_ai/_eval/task/log.py CHANGED
@@ -4,6 +4,7 @@ from typing import Any, Iterator, Literal, cast
 
 from shortuuid import uuid
 
+from inspect_ai._display.core.display import TaskDisplayMetric
 from inspect_ai._eval.task.util import slice_dataset
 from inspect_ai._util.constants import PKG_NAME
 from inspect_ai._util.datetime import iso_now
@@ -34,6 +35,9 @@ from inspect_ai.log._log import (
     eval_config_defaults,
 )
 from inspect_ai.log._recorders import Recorder
+from inspect_ai.log._recorders.buffer import SampleBufferDatabase
+from inspect_ai.log._recorders.types import SampleEvent, SampleSummary
+from inspect_ai.log._transcript import Event
 from inspect_ai.model import (
     GenerateConfig,
     Model,
@@ -159,10 +163,15 @@ class TaskLogger:
 
         # size of flush buffer (how many samples we buffer before hitting storage)
         self.flush_buffer = eval_config.log_buffer or recorder.default_log_buffer()
-        self.flush_pending = 0
+        self.flush_pending: list[tuple[str | int, int]] = []
 
     async def init(self) -> None:
         self._location = await self.recorder.log_init(self.eval)
+        self._buffer_db = SampleBufferDatabase(
+            location=self._location,
+            log_images=self.eval.config.log_images is not False,
+            log_shared=self.eval.config.log_shared,
+        )
 
     @property
     def location(self) -> str:
@@ -174,22 +183,53 @@ class TaskLogger:
 
     async def log_start(self, plan: EvalPlan) -> None:
         await self.recorder.log_start(self.eval, plan)
+        await self.recorder.flush(self.eval)
+
+    async def start_sample(self, sample: SampleSummary) -> None:
+        self._buffer_db.start_sample(sample)
+
+    def log_sample_event(self, id: str | int, epoch: int, event: Event) -> None:
+        # log the sample event
+        self._buffer_db.log_events([SampleEvent(id=id, epoch=epoch, event=event)])
 
-    async def log_sample(self, sample: EvalSample, *, flush: bool) -> None:
+    async def complete_sample(self, sample: EvalSample, *, flush: bool) -> None:
         # log the sample
         await self.recorder.log_sample(self.eval, sample)
 
+        # mark complete
+        self._buffer_db.complete_sample(
+            SampleSummary(
+                id=sample.id,
+                epoch=sample.epoch,
+                input=sample.input,
+                target=sample.target,
+                completed=True,
+                scores=sample.scores,
+                error=sample.error.message if sample.error is not None else None,
+                limit=f"{sample.limit.type}" if sample.limit is not None else None,
+            )
+        )
+
        # flush if requested
         if flush:
-            self.flush_pending += 1
-            if self.flush_pending >= self.flush_buffer:
+            self.flush_pending.append((sample.id, sample.epoch))
+            if len(self.flush_pending) >= self.flush_buffer:
+                # flush to disk
                 await self.recorder.flush(self.eval)
-                self.flush_pending = 0
+
+                # notify the event db it can remove these
+                self._buffer_db.remove_samples(self.flush_pending)
+
+                # clear the pending list
+                self.flush_pending.clear()
 
         # track successful samples logged
         if sample.error is None:
             self._samples_completed += 1
 
+    def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
+        self._buffer_db.update_metrics(metrics)
+
     async def log_finish(
         self,
         status: Literal["success", "cancelled", "error"],
@@ -198,10 +238,17 @@
         reductions: list[EvalSampleReductions] | None = None,
         error: EvalError | None = None,
     ) -> EvalLog:
-        return await self.recorder.log_finish(
+        # finish and get log
+        log = await self.recorder.log_finish(
             self.eval, status, stats, results, reductions, error
         )
 
+        # cleanup the events db
+        self._buffer_db.cleanup()
+
+        # return log
+        return log
+
 
 async def log_start(
     logger: TaskLogger,
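
Read together, the TaskLogger changes imply roughly this per-task lifecycle (a sketch using only the methods shown above; construction details elided):

    logger = TaskLogger(...)                 # constructed as before
    await logger.init()                      # also creates the SampleBufferDatabase
    await logger.start_sample(summary)       # register a sample in the buffer db
    logger.log_sample_event(id, epoch, ev)   # stream events for realtime viewers
    await logger.complete_sample(sample, flush=True)  # record sample, batched flush
    logger.update_metrics(metrics)           # push realtime metrics to the buffer
    log = await logger.log_finish("success", stats, results)  # cleans up buffer db

Flushing is now tracked per sample: once flush_buffer samples are pending, the recorder flushes to disk and the flushed samples are removed from the buffer database.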