inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl

This diff reflects the changes between two publicly released package versions as they appear in their public registry; it is provided for informational purposes only.
Files changed (182)
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/sandbox.py +4 -1
  8. inspect_ai/_cli/score.py +181 -32
  9. inspect_ai/_cli/trace.py +2 -0
  10. inspect_ai/_cli/view.py +4 -2
  11. inspect_ai/_display/core/config.py +7 -1
  12. inspect_ai/_display/core/progress.py +1 -1
  13. inspect_ai/_display/textual/app.py +8 -4
  14. inspect_ai/_display/textual/widgets/samples.py +6 -5
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/__init__.py +0 -0
  17. inspect_ai/_eval/eval.py +100 -97
  18. inspect_ai/_eval/evalset.py +69 -69
  19. inspect_ai/_eval/loader.py +122 -12
  20. inspect_ai/_eval/registry.py +1 -1
  21. inspect_ai/_eval/run.py +14 -0
  22. inspect_ai/_eval/score.py +125 -36
  23. inspect_ai/_eval/task/log.py +105 -4
  24. inspect_ai/_eval/task/results.py +92 -38
  25. inspect_ai/_eval/task/run.py +6 -2
  26. inspect_ai/_eval/task/sandbox.py +35 -2
  27. inspect_ai/_eval/task/task.py +49 -46
  28. inspect_ai/_util/__init__.py +0 -0
  29. inspect_ai/_util/constants.py +1 -1
  30. inspect_ai/_util/content.py +8 -0
  31. inspect_ai/_util/error.py +2 -0
  32. inspect_ai/_util/file.py +15 -1
  33. inspect_ai/_util/logger.py +4 -2
  34. inspect_ai/_util/registry.py +7 -1
  35. inspect_ai/_view/view.py +1 -2
  36. inspect_ai/_view/www/App.css +8 -3
  37. inspect_ai/_view/www/README.md +1 -1
  38. inspect_ai/_view/www/dist/assets/index.css +66 -38
  39. inspect_ai/_view/www/dist/assets/index.js +525 -523
  40. inspect_ai/_view/www/log-schema.json +86 -73
  41. inspect_ai/_view/www/package.json +1 -1
  42. inspect_ai/_view/www/src/App.tsx +1 -0
  43. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
  44. inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
  45. inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
  46. inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
  47. inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
  48. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
  49. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
  50. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
  51. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
  52. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
  53. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
  54. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
  55. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
  56. inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
  57. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
  58. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
  59. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
  60. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
  64. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
  65. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
  66. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
  67. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
  68. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
  69. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
  70. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
  72. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
  73. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
  74. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
  75. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
  76. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
  77. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
  78. inspect_ai/_view/www/src/types/log.d.ts +107 -19
  79. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
  80. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
  81. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
  82. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
  83. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
  84. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
  85. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
  86. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
  87. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
  88. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
  89. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  90. inspect_ai/approval/_approval.py +2 -0
  91. inspect_ai/approval/_approver.py +4 -4
  92. inspect_ai/approval/_auto.py +1 -1
  93. inspect_ai/approval/_human/approver.py +3 -0
  94. inspect_ai/approval/_policy.py +5 -0
  95. inspect_ai/approval/_registry.py +2 -2
  96. inspect_ai/dataset/_dataset.py +36 -45
  97. inspect_ai/dataset/_sources/__init__.py +0 -0
  98. inspect_ai/dataset/_sources/csv.py +13 -13
  99. inspect_ai/dataset/_sources/hf.py +29 -29
  100. inspect_ai/dataset/_sources/json.py +10 -10
  101. inspect_ai/log/__init__.py +2 -0
  102. inspect_ai/log/_convert.py +3 -3
  103. inspect_ai/log/_file.py +24 -9
  104. inspect_ai/log/_log.py +98 -7
  105. inspect_ai/log/_message.py +3 -1
  106. inspect_ai/log/_recorders/file.py +4 -0
  107. inspect_ai/log/_recorders/recorder.py +3 -0
  108. inspect_ai/log/_transcript.py +19 -8
  109. inspect_ai/model/__init__.py +2 -0
  110. inspect_ai/model/_cache.py +39 -21
  111. inspect_ai/model/_call_tools.py +2 -2
  112. inspect_ai/model/_chat_message.py +14 -4
  113. inspect_ai/model/_generate_config.py +1 -1
  114. inspect_ai/model/_model.py +31 -24
  115. inspect_ai/model/_model_output.py +14 -1
  116. inspect_ai/model/_openai.py +10 -18
  117. inspect_ai/model/_providers/google.py +9 -5
  118. inspect_ai/model/_providers/openai.py +5 -9
  119. inspect_ai/model/_providers/openrouter.py +1 -1
  120. inspect_ai/scorer/__init__.py +6 -1
  121. inspect_ai/scorer/_answer.py +1 -1
  122. inspect_ai/scorer/_classification.py +4 -0
  123. inspect_ai/scorer/_match.py +4 -5
  124. inspect_ai/scorer/_metric.py +87 -28
  125. inspect_ai/scorer/_metrics/__init__.py +3 -3
  126. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  127. inspect_ai/scorer/_metrics/mean.py +3 -17
  128. inspect_ai/scorer/_metrics/std.py +111 -30
  129. inspect_ai/scorer/_model.py +12 -12
  130. inspect_ai/scorer/_pattern.py +3 -3
  131. inspect_ai/scorer/_reducer/reducer.py +36 -21
  132. inspect_ai/scorer/_reducer/registry.py +2 -2
  133. inspect_ai/scorer/_reducer/types.py +7 -1
  134. inspect_ai/scorer/_score.py +11 -1
  135. inspect_ai/scorer/_scorer.py +110 -16
  136. inspect_ai/solver/__init__.py +1 -1
  137. inspect_ai/solver/_basic_agent.py +19 -22
  138. inspect_ai/solver/_bridge/__init__.py +0 -3
  139. inspect_ai/solver/_bridge/bridge.py +3 -3
  140. inspect_ai/solver/_chain.py +1 -2
  141. inspect_ai/solver/_critique.py +3 -3
  142. inspect_ai/solver/_fork.py +2 -2
  143. inspect_ai/solver/_human_agent/__init__.py +0 -0
  144. inspect_ai/solver/_human_agent/agent.py +5 -8
  145. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  146. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  147. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  148. inspect_ai/solver/_multiple_choice.py +15 -18
  149. inspect_ai/solver/_prompt.py +7 -7
  150. inspect_ai/solver/_solver.py +53 -52
  151. inspect_ai/solver/_task_state.py +80 -69
  152. inspect_ai/solver/_use_tools.py +9 -9
  153. inspect_ai/tool/__init__.py +2 -1
  154. inspect_ai/tool/_tool.py +43 -14
  155. inspect_ai/tool/_tool_call.py +6 -2
  156. inspect_ai/tool/_tool_choice.py +3 -1
  157. inspect_ai/tool/_tool_def.py +10 -8
  158. inspect_ai/tool/_tool_params.py +24 -0
  159. inspect_ai/tool/_tool_with.py +7 -7
  160. inspect_ai/tool/_tools/__init__.py +0 -0
  161. inspect_ai/tool/_tools/_computer/_common.py +2 -2
  162. inspect_ai/tool/_tools/_computer/_computer.py +11 -0
  163. inspect_ai/tool/_tools/_execute.py +15 -9
  164. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  165. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  166. inspect_ai/tool/_tools/_web_search.py +7 -5
  167. inspect_ai/util/_concurrency.py +3 -3
  168. inspect_ai/util/_panel.py +2 -0
  169. inspect_ai/util/_resource.py +12 -12
  170. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  171. inspect_ai/util/_sandbox/docker/config.py +2 -1
  172. inspect_ai/util/_sandbox/docker/docker.py +10 -1
  173. inspect_ai/util/_sandbox/docker/service.py +100 -0
  174. inspect_ai/util/_sandbox/environment.py +99 -96
  175. inspect_ai/util/_subprocess.py +5 -3
  176. inspect_ai/util/_subtask.py +15 -16
  177. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
  178. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
  179. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
  180. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
  181. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
  182. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/eval.py CHANGED
@@ -89,67 +89,67 @@ def eval(
  r"""Evaluate tasks using a Model.

  Args:
- tasks: (Tasks): Task(s) to evaluate. If None, attempt
+ tasks: Task(s) to evaluate. If None, attempt
  to evaluate a task in the current working directory
- model (str | Model | list[str] | list[Model] | None): Model(s) for
+ model: Model(s) for
  evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
  environment variable.
- model_base_url: (str | None): Base URL for communicating
+ model_base_url: Base URL for communicating
  with the model API.
- model_args (dict[str,Any] | str): Model creation args
+ model_args: Model creation args
  (as a dictionary or as a path to a JSON or YAML config file)
- task_args (dict[str,Any] | str): Task creation arguments
+ task_args: Task creation arguments
  (as a dictionary or as a path to a JSON or YAML config file)
- sandbox (SandboxEnvironmentType | None): Sandbox environment type
- (or optionally a str or tuple with a shorthand spec)
- sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
- (defaults to True)
- solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
- Optional (uses task solver by default).
- tags (list[str] | None): Tags to associate with this evaluation run.
- trace (bool | None): Trace message interactions with evaluated model to terminal.
- display (DisplayType | None): Task display type (defaults to 'full').
- approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
- Either a path to an approval policy config file or a list of approval policies.
- Defaults to no approval policy.
- log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
- "info", "warning", "error", or "critical" (defaults to "warning")
- log_level_transcript (str | None): Level for logging to the log file (defaults to "info")
- log_dir (str | None): Output path for logging results
- (defaults to file log in ./logs directory).
- log_format (Literal["eval", "json"] | None): Format for writing log files (defaults
- to "eval", the native high-performance format).
- limit (int | tuple[int, int] | None): Limit evaluated samples
- (defaults to all samples).
- sample_id (str | int | list[str | int] | None): Evaluate specific sample(s) from the dataset.
- epochs (int | Epochs | None): Epochs to repeat samples for and optional score
- reducer function(s) used to combine sample scores (defaults to "mean")
- fail_on_error (bool | float | None): `True` to fail on first sample error
- (default); `False` to never fail on sample errors; Value between 0 and 1
- to fail if a proportion of total samples fails. Value greater than 1 to fail
- eval if a count of samples fails.
- debug_errors (bool | None): Raise task errors (rather than logging them)
- so they can be debugged (defaults to False).
- message_limit (int | None): Limit on total messages used for each sample.
- token_limit (int | None): Limit on total tokens used for each sample.
- time_limit (int | None): Limit on time (in seconds) for execution of each sample.
- max_samples (int | None): Maximum number of samples to run in parallel
- (default is max_connections)
- max_tasks (int | None): Maximum number of tasks to run in parallel
- (default is 1)
- max_subprocesses (int | None): Maximum number of subprocesses to
- run in parallel (default is os.cpu_count())
- max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
- to run in parallel.
- log_samples: (bool | None): Log detailed samples and scores (defaults to True)
- log_images: (bool | None): Log base64 encoded version of images,
- even if specified as a filename or URL (defaults to False)
- log_buffer: (int | None): Number of samples to buffer before writing log file.
- If not specified, an appropriate default for the format and filesystem is
- chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
- score (bool): Score output (defaults to True)
- score_display (bool | None): Show scoring metrics in realtime (defaults to True)
- **kwargs (GenerateConfigArgs): Model generation options.
+ sandbox: Sandbox environment type
+ (or optionally a str or tuple with a shorthand spec)
+ sandbox_cleanup: Cleanup sandbox environments after task completes
+ (defaults to True)
+ solver: Alternative solver for task(s).
+ Optional (uses task solver by default).
+ tags: Tags to associate with this evaluation run.
+ trace: Trace message interactions with evaluated model to terminal.
+ display: Task display type (defaults to 'full').
+ approval: Tool use approval policies.
+ Either a path to an approval policy config file or a list of approval policies.
+ Defaults to no approval policy.
+ log_level: Level for logging to the console: "debug", "http", "sandbox",
+ "info", "warning", "error", or "critical" (defaults to "warning")
+ log_level_transcript: Level for logging to the log file (defaults to "info")
+ log_dir: Output path for logging results
+ (defaults to file log in ./logs directory).
+ log_format: Format for writing log files (defaults
+ to "eval", the native high-performance format).
+ limit: Limit evaluated samples
+ (defaults to all samples).
+ sample_id: Evaluate specific sample(s) from the dataset.
+ epochs: Epochs to repeat samples for and optional score
+ reducer function(s) used to combine sample scores (defaults to "mean")
+ fail_on_error: `True` to fail on first sample error
+ (default); `False` to never fail on sample errors; Value between 0 and 1
+ to fail if a proportion of total samples fails. Value greater than 1 to fail
+ eval if a count of samples fails.
+ debug_errors: Raise task errors (rather than logging them)
+ so they can be debugged (defaults to False).
+ message_limit: Limit on total messages used for each sample.
+ token_limit: Limit on total tokens used for each sample.
+ time_limit: Limit on time (in seconds) for execution of each sample.
+ max_samples: Maximum number of samples to run in parallel
+ (default is max_connections)
+ max_tasks: Maximum number of tasks to run in parallel
+ (default is 1)
+ max_subprocesses: Maximum number of subprocesses to
+ run in parallel (default is os.cpu_count())
+ max_sandboxes: Maximum number of sandboxes (per-provider)
+ to run in parallel.
+ log_samples: Log detailed samples and scores (defaults to True)
+ log_images: Log base64 encoded version of images,
+ even if specified as a filename or URL (defaults to False)
+ log_buffer: Number of samples to buffer before writing log file.
+ If not specified, an appropriate default for the format and filesystem is
+ chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+ score: Score output (defaults to True)
+ score_display: Show scoring metrics in realtime (defaults to True)
+ **kwargs: Model generation options.

  Returns:
  List of EvalLog (one for each task)
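This hunk converts the eval() docstring to plain, untyped parameter descriptions. For orientation, a minimal call exercising a few of the documented parameters might look like the sketch below; the task and model names are placeholders, not part of this diff.

```python
from inspect_ai import eval

# Placeholder task/model names; the parameters mirror the docstring above.
logs = eval(
    "theory_of_mind",        # task to evaluate
    model="openai/gpt-4o",   # model(s) for evaluation
    limit=10,                # evaluate only the first 10 samples
    log_dir="./logs",        # output path for logging results
    fail_on_error=0.1,       # fail the eval if more than 10% of samples error
)
```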
@@ -359,10 +359,14 @@ async def eval_async(
  "Trace mode cannot be used when evaluating multiple models."
  )

- # resolve recorder
+ # resolve recorder (confirm writeable)
  log_dir = log_dir if log_dir else os.environ.get("INSPECT_LOG_DIR", "./logs")
  log_dir = absolute_file_path(log_dir)
  recorder = create_recorder_for_format(log_format or DEFAULT_LOG_FORMAT, log_dir)
+ if not recorder.is_writeable():
+ raise PrerequisiteError(
+ f"ERROR: You do not have write permission for the log_dir '{log_dir}'"
+ )

  # resolve solver
  solver = chain(solver) if isinstance(solver, list) else solver
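This hunk adds an up-front writeability check on the log directory, so an unwritable log_dir now fails fast with a PrerequisiteError before any samples run. A hedged sketch of what that looks like from the caller's side (the read-only path is hypothetical):

```python
from inspect_ai import eval

# Hypothetical read-only location; per the hunk above, eval should now refuse
# to start rather than failing partway through the run.
try:
    eval("theory_of_mind", model="openai/gpt-4o", log_dir="/mnt/readonly/logs")
except Exception as ex:  # PrerequisiteError is raised from an internal module
    print(f"eval did not start: {ex}")
```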
@@ -492,47 +496,46 @@ def eval_retry(
  """Retry a previously failed evaluation task.

  Args:
- tasks: (str | EvalLogInfo | EvalLog | list[str] | list[EvalLogInfo] | list[EvalLog]):
- Log files for task(s) to retry.
- log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
- "info", "warning", "error", or "critical" (defaults to "warning")
- log_level_transcript (str | None): Level for logging to the log file (defaults to "info")
- log_dir (str | None): Output path for logging results
- (defaults to file log in ./logs directory).
- log_format (Literal["eval", "json"] | None): Format for writing log files (defaults
- to "eval", the native high-performance format).
- max_samples (int | None): Maximum number of samples to run in parallel
- (default is max_connections)
- max_tasks (int | None): Maximum number of tasks to run in parallel
- (default is 1)
- max_subprocesses (int | None): Maximum number of subprocesses to
- run in parallel (default is os.cpu_count())
- max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
- to run in parallel.
- sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
- (defaults to True)
- trace (bool | None): Trace message interactions with evaluated model to terminal.
- display (DisplayType | None): Task display type (defaults to 'full').
- fail_on_error (bool | float | None): `True` to fail on first sample error
- (default); `False` to never fail on sample errors; Value between 0 and 1
- to fail if a proportion of total samples fails. Value greater than 1 to fail
- eval if a count of samples fails.
- debug_errors (bool | None): Raise task errors (rather than logging them)
- so they can be debugged (defaults to False).
- log_samples: (bool | None): Log detailed samples and scores (defaults to True)
- log_images: (bool | None): Log base64 encoded version of images,
- even if specified as a filename or URL (defaults to False)
- log_buffer: (int | None): Number of samples to buffer before writing log file.
- If not specified, an appropriate default for the format and filesystem is
- chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
- score (bool): Score output (defaults to True)
- score_display (bool | None): Show scoring metrics in realtime (defaults to True)
- max_retries (int | None):
- Maximum number of times to retry request.
- timeout: (int | None):
- Request timeout (in seconds)
- max_connections (int | None):
- Maximum number of concurrent connections to Model API (default is per Model API)
+ tasks: Log files for task(s) to retry.
+ log_level: Level for logging to the console: "debug", "http", "sandbox",
+ "info", "warning", "error", or "critical" (defaults to "warning")
+ log_level_transcript: Level for logging to the log file (defaults to "info")
+ log_dir: Output path for logging results
+ (defaults to file log in ./logs directory).
+ log_format: Format for writing log files (defaults
+ to "eval", the native high-performance format).
+ max_samples: Maximum number of samples to run in parallel
+ (default is max_connections)
+ max_tasks: Maximum number of tasks to run in parallel
+ (default is 1)
+ max_subprocesses: Maximum number of subprocesses to
+ run in parallel (default is os.cpu_count())
+ max_sandboxes: Maximum number of sandboxes (per-provider)
+ to run in parallel.
+ sandbox_cleanup: Cleanup sandbox environments after task completes
+ (defaults to True)
+ trace: Trace message interactions with evaluated model to terminal.
+ display: Task display type (defaults to 'full').
+ fail_on_error: `True` to fail on first sample error
+ (default); `False` to never fail on sample errors; Value between 0 and 1
+ to fail if a proportion of total samples fails. Value greater than 1 to fail
+ eval if a count of samples fails.
+ debug_errors: Raise task errors (rather than logging them)
+ so they can be debugged (defaults to False).
+ log_samples: Log detailed samples and scores (defaults to True)
+ log_images: Log base64 encoded version of images,
+ even if specified as a filename or URL (defaults to False)
+ log_buffer: Number of samples to buffer before writing log file.
+ If not specified, an appropriate default for the format and filesystem is
+ chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+ score: Score output (defaults to True)
+ score_display: Show scoring metrics in realtime (defaults to True)
+ max_retries:
+ Maximum number of times to retry request.
+ timeout:
+ Request timeout (in seconds)
+ max_connections:
+ Maximum number of concurrent connections to Model API (default is per Model API)

  Returns:
  List of EvalLog (one for each task)
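The eval_retry() docstring gets the same untyped treatment. A minimal retry call using a few of the parameters documented above might look like this sketch (the log file path is a placeholder):

```python
from inspect_ai import eval_retry

# Placeholder log file; retries the failed task recorded in that log.
logs = eval_retry(
    "./logs/2025-01-15T10-00-00_theory_of_mind_abc123.eval",
    max_connections=5,    # cap concurrent connections to the model API
    fail_on_error=False,  # never fail the retry on individual sample errors
)
```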
inspect_ai/_eval/evalset.py CHANGED
@@ -93,79 +93,79 @@ def eval_set(
  r"""Evaluate a set of tasks.

  Args:
- tasks: (Tasks): Task(s) to evaluate. If None, attempt
+ tasks: Task(s) to evaluate. If None, attempt
  to evaluate a task in the current working directory
- log_dir (str): Output path for logging results
- (required to ensure that a unique storage scope is assigned for the set).
- retry_attempts: (int | None): Maximum number of retry attempts before giving up
- (defaults to 10).
- retry_wait (float | None): Time to wait between attempts, increased exponentially.
- (defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time
- per-retry will in no case by longer than 1 hour.
- retry_connections (float | None): Reduce max_connections at this rate with each retry
- (defaults to 0.5)
- retry_cleanup (bool | None): Cleanup failed log files after retries
- (defaults to True)
- model (str | Model | list[str] | list[Model] | None): Model(s) for
- evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
- environment variable.
- model_base_url: (str | None): Base URL for communicating
- with the model API.
- model_args (dict[str,Any] | str): Model creation args
- (as a dictionary or as a path to a JSON or YAML config file)
- task_args (dict[str,Any] | str): Task creation arguments
- (as a dictionary or as a path to a JSON or YAML config file)
- sandbox (SandboxEnvironmentType | None): Sandbox environment type
- (or optionally a str or tuple with a shorthand spec)
- sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
- (defaults to True)
- solver (Solver | list[Solver] | SolverSpec | None): Alternative solver(s) for
- evaluating task(s). ptional (uses task solver by default).
- tags (list[str] | None): Tags to associate with this evaluation run.
- trace: (bool | None): Trace message interactions with evaluated model to terminal.
- display (DisplayType | None): Task display type (defaults to 'full').
- approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
- Either a path to an approval policy config file or a list of approval policies.
- Defaults to no approval policy.
- score (bool): Score output (defaults to True)
- log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
- "info", "warning", "error", or "critical" (defaults to "warning")
- log_level_transcript (str | None): Level for logging to the log file (defaults to "info")
- log_format (Literal["eval", "json"] | None): Format for writing
- log files (defaults to "eval", the native high-performance format).
- limit (int | tuple[int, int] | None): Limit evaluated samples
- (defaults to all samples).
- sample_id (str | int | list[str | int] | None): Evaluate specific sample(s) from the dataset.
- epochs (int | Epochs | None): Epochs to repeat samples for and optional score
- reducer function(s) used to combine sample scores (defaults to "mean")
- fail_on_error (bool | float | None): `True` to fail on first sample error
- (default); `False` to never fail on sample errors; Value between 0 and 1
- to fail if a proportion of total samples fails. Value greater than 1 to fail
- eval if a count of samples fails.
- debug_errors (bool | None): Raise task errors (rather than logging them)
- so they can be debugged (defaults to False).
- message_limit (int | None): Limit on total messages used for each sample.
- token_limit (int | None): Limit on total tokens used for each sample.
- time_limit (int | None): Limit on time (in seconds) for execution of each sample.
- max_samples (int | None): Maximum number of samples to run in parallel
- (default is max_connections)
- max_tasks (int | None): Maximum number of tasks to run in parallel
- (default is 1)
- max_subprocesses (int | None): Maximum number of subprocesses to
- run in parallel (default is os.cpu_count())
- max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
- to run in parallel.
- log_samples: (bool | None): Log detailed samples and scores (defaults to True)
- log_images: (bool | None): Log base64 encoded version of images,
+ log_dir: Output path for logging results
+ (required to ensure that a unique storage scope is assigned for the set).
+ retry_attempts: Maximum number of retry attempts before giving up
+ (defaults to 10).
+ retry_wait: Time to wait between attempts, increased exponentially.
+ (defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time
+ per-retry will in no case by longer than 1 hour.
+ retry_connections: Reduce max_connections at this rate with each retry
+ (defaults to 0.5)
+ retry_cleanup: Cleanup failed log files after retries
+ (defaults to True)
+ model: Model(s) for
+ evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
+ environment variable.
+ model_base_url: Base URL for communicating
+ with the model API.
+ model_args: Model creation args
+ (as a dictionary or as a path to a JSON or YAML config file)
+ task_args: Task creation arguments
+ (as a dictionary or as a path to a JSON or YAML config file)
+ sandbox: Sandbox environment type
+ (or optionally a str or tuple with a shorthand spec)
+ sandbox_cleanup: Cleanup sandbox environments after task completes
+ (defaults to True)
+ solver: Alternative solver(s) for
+ evaluating task(s). ptional (uses task solver by default).
+ tags: Tags to associate with this evaluation run.
+ trace: Trace message interactions with evaluated model to terminal.
+ display: Task display type (defaults to 'full').
+ approval: Tool use approval policies.
+ Either a path to an approval policy config file or a list of approval policies.
+ Defaults to no approval policy.
+ score: Score output (defaults to True)
+ log_level: Level for logging to the console: "debug", "http", "sandbox",
+ "info", "warning", "error", or "critical" (defaults to "warning")
+ log_level_transcript: Level for logging to the log file (defaults to "info")
+ log_format: Format for writing
+ log files (defaults to "eval", the native high-performance format).
+ limit: Limit evaluated samples
+ (defaults to all samples).
+ sample_id: Evaluate specific sample(s) from the dataset.
+ epochs: Epochs to repeat samples for and optional score
+ reducer function(s) used to combine sample scores (defaults to "mean")
+ fail_on_error: `True` to fail on first sample error
+ (default); `False` to never fail on sample errors; Value between 0 and 1
+ to fail if a proportion of total samples fails. Value greater than 1 to fail
+ eval if a count of samples fails.
+ debug_errors: Raise task errors (rather than logging them)
+ so they can be debugged (defaults to False).
+ message_limit: Limit on total messages used for each sample.
+ token_limit: Limit on total tokens used for each sample.
+ time_limit: Limit on time (in seconds) for execution of each sample.
+ max_samples: Maximum number of samples to run in parallel
+ (default is max_connections)
+ max_tasks: Maximum number of tasks to run in parallel
+ (default is 1)
+ max_subprocesses: Maximum number of subprocesses to
+ run in parallel (default is os.cpu_count())
+ max_sandboxes: Maximum number of sandboxes (per-provider)
+ to run in parallel.
+ log_samples: Log detailed samples and scores (defaults to True)
+ log_images: Log base64 encoded version of images,
  even if specified as a filename or URL (defaults to False)
- log_buffer: (int | None): Number of samples to buffer before writing log file.
- If not specified, an appropriate default for the format and filesystem is
- chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
- bundle_dir: (str | None): If specified, the log viewer and logs generated
+ log_buffer: Number of samples to buffer before writing log file.
+ If not specified, an appropriate default for the format and filesystem is
+ chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+ bundle_dir: If specified, the log viewer and logs generated
  by this eval set will be bundled into this directory.
- bundle_overwrite (bool): Whether to overwrite files in the bundle_dir.
+ bundle_overwrite: Whether to overwrite files in the bundle_dir.
  (defaults to False).
- **kwargs (GenerateConfigArgs): Model generation options.
+ **kwargs: Model generation options.

  Returns:
  Tuple of bool (whether all tasks completed successfully) and list of EvalLog
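As with eval(), the eval_set() docstring drops inline type annotations. A minimal sketch of a call using the retry-related parameters documented above (the task names and directory are placeholders):

```python
from inspect_ai import eval_set

# Placeholder tasks and log_dir; log_dir is required so the set gets a unique storage scope.
success, logs = eval_set(
    ["theory_of_mind", "hellaswag"],
    model="openai/gpt-4o",
    log_dir="./logs/eval-set-01",
    retry_attempts=5,       # give up after 5 retry attempts
    retry_connections=0.5,  # halve max_connections on each retry
)
```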
inspect_ai/_eval/loader.py CHANGED
@@ -8,7 +8,7 @@ from importlib.util import module_from_spec, spec_from_loader
  from logging import getLogger
  from pathlib import Path
  from types import ModuleType
- from typing import Any, Callable, cast
+ from typing import Any, Callable, Tuple, cast

  from typing_extensions import overload

@@ -26,6 +26,7 @@ from inspect_ai._util.registry import (
  registry_params,
  )
  from inspect_ai.model import Model, ModelName
+ from inspect_ai.scorer._scorer import Scorer, ScorerSpec, scorer_create
  from inspect_ai.solver._bridge import bridge
  from inspect_ai.solver._solver import Solver, SolverSpec
  from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
@@ -421,16 +422,7 @@ def as_solver_spec(solver: Solver) -> SolverSpec:

  def solver_from_spec(spec: SolverSpec) -> Solver:
  # resolve @ reference
- spec_split = split_spec(spec.solver)
- if spec_split[1] is not None:
- solver_file: Path | None = Path(spec_split[0]).resolve()
- solver_name: str | None = spec_split[1]
- elif Path(spec_split[0]).exists():
- solver_file = Path(spec_split[0]).resolve()
- solver_name = None
- else:
- solver_file = None
- solver_name = spec_split[0]
+ solver_file, solver_name = parse_spec_str(spec.solver)

  # switch contexts if we are loading from a file
  create_cm = (
@@ -501,7 +493,7 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
  else:
  agent_fn = getattr(solver_module, solver_name, None)
  if inspect.isfunction(agent_fn):
- return bridge(agent_fn(**spec.args))
+ return bridge.bridge(agent_fn(**spec.args))
  elif agent_fn is not None:
  raise PrerequisiteError(
  f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
@@ -510,3 +502,121 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
  raise PrerequisiteError(
  f"The function {solver_name} was not found in file {pretty_solver_file}."
  )
+
+
+ def scorer_from_spec(spec: ScorerSpec, task_path: Path | None, **kwargs: Any) -> Scorer:
+ """
+ Load a scorer
+
+ Args:
+ spec: The scorer spec
+ task_path: An optional path to the task file
+ **kwargs: Additional keyword arguments passed to the scorer initialization
+
+ Returns:
+ Scorer: the loaded scorer
+
+ Raises:
+ PrerequisiteError: If the scorer cannot be found, loaded, or lacks required type annotations
+ """
+ # resolve @ reference
+ scorer_file, scorer_name = parse_spec_str(spec.scorer)
+
+ # switch contexts if we are loading from a file
+ create_cm = (
+ chdir_python(scorer_file.parent.as_posix())
+ if scorer_file is not None
+ else contextlib.nullcontext()
+ )
+
+ # pretty solver name for error messages
+ pretty_scorer_file = (
+ cwd_relative_path(scorer_file.as_posix()) if scorer_file else None
+ )
+
+ with create_cm:
+ # is there a scorer file being provided? if not, load from registry
+ if scorer_file is None:
+ if scorer_name is None:
+ raise ValueError(f"Unable to resolve scorer name from {spec.scorer}")
+
+ try:
+ return scorer_create(scorer_name, **kwargs)
+ except ValueError:
+ # We need a valid path to a scorer file to try to load the scorer from there
+ if not task_path:
+ raise PrerequisiteError(
+ f"The scorer '{scorer_name}' couldn't be loaded. Please provide a path to the file containing the scorer using the '--scorer' parameter"
+ )
+
+ task_pretty_path = task_path.as_posix()
+ if not task_path.exists():
+ raise PrerequisiteError(
+ f"The scorer `{scorer_name}` couldn't be loaded. The file '{task_pretty_path}' was not found. Please provide a path to the file containing the scorer using the '--scorer' parameter"
+ )
+
+ # We have the path to a file, so load that and try again
+ try:
+ load_module(task_path)
+ scorer_fn = scorer_create(scorer_name, **kwargs)
+
+ # See if the scorer doesn't have type annotations. Currently the registry will not load
+ # the function without type annotations.
+ # TODO: We could consider calling this ourselves if we're certain it is what we're looking for
+ signature = inspect.signature(scorer_fn)
+ if signature.return_annotation is inspect.Signature.empty:
+ raise PrerequisiteError(
+ f"The scorer '{scorer_name}' in the file '{task_pretty_path}' requires return type annotations. Please add type annotations to load the scorer."
+ )
+ return scorer_fn
+ except ValueError:
+ # we still couldn't load this, request the user provide a path
+ raise PrerequisiteError(
+ f"The scorer '{scorer_name}' in the file '{task_pretty_path}' couldn't be loaded. Please provide a path to the file containing the scorer using the '--scorer' parameter."
+ )
+ except ModuleNotFoundError:
+ # we still couldn't load this, request the user provide a path
+ raise PrerequisiteError(
+ f"The scorer '{scorer_name}' in the file '{task_pretty_path}' couldn't be loaded. Please provide a path to the file containing the scorer using the '--scorer' parameter."
+ )
+
+ # solver is a path, so load it that way
+ else:
+ load_module(scorer_file)
+ decorators = parse_decorators(scorer_file, "scorer")
+
+ # if there is no solver_name see if we can discover it
+ if scorer_name is None:
+ if len(decorators) == 1:
+ # decorator based solver
+ scorer_name = decorators[0][0]
+ elif len(decorators) == 0:
+ raise PrerequisiteError(
+ f"The source file {pretty_scorer_file} does not contain any @scorer functions."
+ )
+ else:
+ raise PrerequisiteError(
+ f"The source file {pretty_scorer_file} has more than one @solver function (qualify which solver using e.g. '{scorer_file.name}y@solver_fn')"
+ )
+
+ # create decorator based solvers using the registry
+ if any(solver[0] == scorer_name for solver in decorators):
+ return scorer_create(scorer_name, **kwargs)
+ else:
+ raise PrerequisiteError(
+ f"The function {scorer_name} was not found in file {pretty_scorer_file}."
+ )
+
+
+ def parse_spec_str(spec_str: str) -> Tuple[Path | None, str | None]:
+ spec_split = split_spec(spec_str)
+ if spec_split[1] is not None:
+ file: Path | None = Path(spec_split[0]).resolve()
+ name: str | None = spec_split[1]
+ elif Path(spec_split[0]).exists():
+ file = Path(spec_split[0]).resolve()
+ name = None
+ else:
+ file = None
+ name = spec_split[0]
+ return file, name
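The new scorer_from_spec()/parse_spec_str() mirror the existing solver loading: a spec is either a registry name or a "file@name" reference, and the error messages above point users at a '--scorer' parameter. A scorer file this loader could plausibly resolve (e.g. as my_scorers.py@exact_answer) might look like the sketch below; the file and function names are hypothetical, while the decorator and types are the public inspect_ai scorer API. Note the return type annotation on the inner score function, which the loader now checks for.

```python
# my_scorers.py (hypothetical file name)
from inspect_ai.scorer import CORRECT, INCORRECT, Score, Scorer, Target, accuracy, scorer
from inspect_ai.solver import TaskState


@scorer(metrics=[accuracy()])
def exact_answer() -> Scorer:
    # The registry requires type annotations; per the hunk above, the loader
    # raises a PrerequisiteError if the score function lacks a return annotation.
    async def score(state: TaskState, target: Target) -> Score:
        answer = state.output.completion.strip()
        return Score(
            value=CORRECT if answer == target.text else INCORRECT,
            answer=answer,
        )

    return score
```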
inspect_ai/_eval/registry.py CHANGED
@@ -148,7 +148,7 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
  # module import, so set its task file and run dir
  if get_installed_package_name(task_type) is None:
  module = inspect.getmodule(task_type)
- if module and hasattr(module, "__file__"):
+ if module and hasattr(module, "__file__") and module.__file__:
  file = Path(getattr(module, "__file__"))
  setattr(task_instance, TASK_FILE_ATTR, file.as_posix())
  setattr(task_instance, TASK_RUN_DIR_ATTR, file.parent.as_posix())
inspect_ai/_eval/run.py CHANGED
@@ -20,8 +20,10 @@ from inspect_ai.log import EvalConfig, EvalLog
  from inspect_ai.log._recorders import Recorder
  from inspect_ai.model import GenerateConfigArgs
  from inspect_ai.model._model import ModelName
+ from inspect_ai.scorer._metric import to_metric_specs
  from inspect_ai.scorer._reducer import ScoreReducer, reducer_log_names
  from inspect_ai.scorer._reducer.registry import validate_reducer
+ from inspect_ai.scorer._scorer import as_scorer_spec
  from inspect_ai.solver._solver import Solver, SolverSpec
  from inspect_ai.util._sandbox.environment import (
  SandboxEnvironmentConfigType,
@@ -100,6 +102,16 @@ async def eval_run(
  eval_solver = None
  eval_solver_spec = None

+ # resolve the task scorers
+ eval_scorer_specs = (
+ [as_scorer_spec(scorer) for scorer in task.scorer]
+ if task.scorer is not None
+ else None
+ )
+
+ # resolve task metrics
+ eval_metrics = to_metric_specs(task.metrics) if task.metrics is not None else None
+
  try:
  # create run tasks
  task_run_options: list[TaskRunOptions] = []
@@ -168,6 +180,8 @@ async def eval_run(
  tags=tags,
  model=resolved_task.model,
  dataset=task.dataset,
+ scorer=eval_scorer_specs,
+ metrics=eval_metrics,
  sandbox=resolved_task.sandbox,
  task_attribs=task.attribs,
  task_args=resolved_task.task_args,