inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/sandbox.py +4 -1
  8. inspect_ai/_cli/score.py +181 -32
  9. inspect_ai/_cli/trace.py +2 -0
  10. inspect_ai/_cli/view.py +4 -2
  11. inspect_ai/_display/core/config.py +7 -1
  12. inspect_ai/_display/core/progress.py +1 -1
  13. inspect_ai/_display/textual/app.py +8 -4
  14. inspect_ai/_display/textual/widgets/samples.py +6 -5
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/__init__.py +0 -0
  17. inspect_ai/_eval/eval.py +100 -97
  18. inspect_ai/_eval/evalset.py +69 -69
  19. inspect_ai/_eval/loader.py +122 -12
  20. inspect_ai/_eval/registry.py +1 -1
  21. inspect_ai/_eval/run.py +14 -0
  22. inspect_ai/_eval/score.py +125 -36
  23. inspect_ai/_eval/task/log.py +105 -4
  24. inspect_ai/_eval/task/results.py +92 -38
  25. inspect_ai/_eval/task/run.py +6 -2
  26. inspect_ai/_eval/task/sandbox.py +35 -2
  27. inspect_ai/_eval/task/task.py +49 -46
  28. inspect_ai/_util/__init__.py +0 -0
  29. inspect_ai/_util/constants.py +1 -1
  30. inspect_ai/_util/content.py +8 -0
  31. inspect_ai/_util/error.py +2 -0
  32. inspect_ai/_util/file.py +15 -1
  33. inspect_ai/_util/logger.py +4 -2
  34. inspect_ai/_util/registry.py +7 -1
  35. inspect_ai/_view/view.py +1 -2
  36. inspect_ai/_view/www/App.css +8 -3
  37. inspect_ai/_view/www/README.md +1 -1
  38. inspect_ai/_view/www/dist/assets/index.css +66 -38
  39. inspect_ai/_view/www/dist/assets/index.js +525 -523
  40. inspect_ai/_view/www/log-schema.json +86 -73
  41. inspect_ai/_view/www/package.json +1 -1
  42. inspect_ai/_view/www/src/App.tsx +1 -0
  43. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
  44. inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
  45. inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
  46. inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
  47. inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
  48. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
  49. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
  50. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
  51. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
  52. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
  53. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
  54. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
  55. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
  56. inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
  57. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
  58. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
  59. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
  60. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
  64. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
  65. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
  66. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
  67. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
  68. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
  69. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
  70. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
  72. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
  73. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
  74. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
  75. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
  76. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
  77. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
  78. inspect_ai/_view/www/src/types/log.d.ts +107 -19
  79. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
  80. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
  81. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
  82. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
  83. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
  84. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
  85. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
  86. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
  87. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
  88. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
  89. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  90. inspect_ai/approval/_approval.py +2 -0
  91. inspect_ai/approval/_approver.py +4 -4
  92. inspect_ai/approval/_auto.py +1 -1
  93. inspect_ai/approval/_human/approver.py +3 -0
  94. inspect_ai/approval/_policy.py +5 -0
  95. inspect_ai/approval/_registry.py +2 -2
  96. inspect_ai/dataset/_dataset.py +36 -45
  97. inspect_ai/dataset/_sources/__init__.py +0 -0
  98. inspect_ai/dataset/_sources/csv.py +13 -13
  99. inspect_ai/dataset/_sources/hf.py +29 -29
  100. inspect_ai/dataset/_sources/json.py +10 -10
  101. inspect_ai/log/__init__.py +2 -0
  102. inspect_ai/log/_convert.py +3 -3
  103. inspect_ai/log/_file.py +24 -9
  104. inspect_ai/log/_log.py +98 -7
  105. inspect_ai/log/_message.py +3 -1
  106. inspect_ai/log/_recorders/file.py +4 -0
  107. inspect_ai/log/_recorders/recorder.py +3 -0
  108. inspect_ai/log/_transcript.py +19 -8
  109. inspect_ai/model/__init__.py +2 -0
  110. inspect_ai/model/_cache.py +39 -21
  111. inspect_ai/model/_call_tools.py +2 -2
  112. inspect_ai/model/_chat_message.py +14 -4
  113. inspect_ai/model/_generate_config.py +1 -1
  114. inspect_ai/model/_model.py +31 -24
  115. inspect_ai/model/_model_output.py +14 -1
  116. inspect_ai/model/_openai.py +10 -18
  117. inspect_ai/model/_providers/google.py +9 -5
  118. inspect_ai/model/_providers/openai.py +5 -9
  119. inspect_ai/model/_providers/openrouter.py +1 -1
  120. inspect_ai/scorer/__init__.py +6 -1
  121. inspect_ai/scorer/_answer.py +1 -1
  122. inspect_ai/scorer/_classification.py +4 -0
  123. inspect_ai/scorer/_match.py +4 -5
  124. inspect_ai/scorer/_metric.py +87 -28
  125. inspect_ai/scorer/_metrics/__init__.py +3 -3
  126. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  127. inspect_ai/scorer/_metrics/mean.py +3 -17
  128. inspect_ai/scorer/_metrics/std.py +111 -30
  129. inspect_ai/scorer/_model.py +12 -12
  130. inspect_ai/scorer/_pattern.py +3 -3
  131. inspect_ai/scorer/_reducer/reducer.py +36 -21
  132. inspect_ai/scorer/_reducer/registry.py +2 -2
  133. inspect_ai/scorer/_reducer/types.py +7 -1
  134. inspect_ai/scorer/_score.py +11 -1
  135. inspect_ai/scorer/_scorer.py +110 -16
  136. inspect_ai/solver/__init__.py +1 -1
  137. inspect_ai/solver/_basic_agent.py +19 -22
  138. inspect_ai/solver/_bridge/__init__.py +0 -3
  139. inspect_ai/solver/_bridge/bridge.py +3 -3
  140. inspect_ai/solver/_chain.py +1 -2
  141. inspect_ai/solver/_critique.py +3 -3
  142. inspect_ai/solver/_fork.py +2 -2
  143. inspect_ai/solver/_human_agent/__init__.py +0 -0
  144. inspect_ai/solver/_human_agent/agent.py +5 -8
  145. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  146. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  147. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  148. inspect_ai/solver/_multiple_choice.py +15 -18
  149. inspect_ai/solver/_prompt.py +7 -7
  150. inspect_ai/solver/_solver.py +53 -52
  151. inspect_ai/solver/_task_state.py +80 -69
  152. inspect_ai/solver/_use_tools.py +9 -9
  153. inspect_ai/tool/__init__.py +2 -1
  154. inspect_ai/tool/_tool.py +43 -14
  155. inspect_ai/tool/_tool_call.py +6 -2
  156. inspect_ai/tool/_tool_choice.py +3 -1
  157. inspect_ai/tool/_tool_def.py +10 -8
  158. inspect_ai/tool/_tool_params.py +24 -0
  159. inspect_ai/tool/_tool_with.py +7 -7
  160. inspect_ai/tool/_tools/__init__.py +0 -0
  161. inspect_ai/tool/_tools/_computer/_common.py +2 -2
  162. inspect_ai/tool/_tools/_computer/_computer.py +11 -0
  163. inspect_ai/tool/_tools/_execute.py +15 -9
  164. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  165. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  166. inspect_ai/tool/_tools/_web_search.py +7 -5
  167. inspect_ai/util/_concurrency.py +3 -3
  168. inspect_ai/util/_panel.py +2 -0
  169. inspect_ai/util/_resource.py +12 -12
  170. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  171. inspect_ai/util/_sandbox/docker/config.py +2 -1
  172. inspect_ai/util/_sandbox/docker/docker.py +10 -1
  173. inspect_ai/util/_sandbox/docker/service.py +100 -0
  174. inspect_ai/util/_sandbox/environment.py +99 -96
  175. inspect_ai/util/_subprocess.py +5 -3
  176. inspect_ai/util/_subtask.py +15 -16
  177. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
  178. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
  179. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
  180. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
  181. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
  182. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
inspect_ai/scorer/_reducer/registry.py
@@ -40,9 +40,9 @@ def score_reducer(
     """Decorator for registering Score Reducers.
 
     Args:
-        func (ScoreReducerType | None): Function returning `ScoreReducer` targeted by
+        func: Function returning `ScoreReducer` targeted by
            plain task decorator without attributes (e.g. `@score_reducer`)
-        name (str | None): Optional name for reducer. If the decorator has no name
+        name: Optional name for reducer. If the decorator has no name
           argument then the name of the function will be used to automatically assign a name.
 
     Returns:
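
The hunk above reformats the docstring of the registration decorator. For orientation, a minimal sketch of registering a custom reducer with it, assuming `score_reducer`, `ScoreReducer`, and `Score` are importable from `inspect_ai.scorer`; the reduction logic itself is illustrative:

```python
from inspect_ai.scorer import Score, ScoreReducer, score_reducer

@score_reducer(name="median_score")
def median_score() -> ScoreReducer:
    def reduce(scores: list[Score]) -> Score:
        # illustrative reduction: take the middle score across epochs
        ordered = sorted(scores, key=lambda s: s.as_float())
        return ordered[len(ordered) // 2]

    return reduce
```
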
inspect_ai/scorer/_reducer/types.py
@@ -5,7 +5,13 @@ from .._metric import Score
 
 @runtime_checkable
 class ScoreReducer(Protocol):
-    def __call__(self, scores: list[Score]) -> Score: ...
+    def __call__(self, scores: list[Score]) -> Score:
+        """Reduce a set of scores to a single score.
+
+        Args:
+            scores: List of scores.
+        """
+        ...
 
     @property
     def __name__(self) -> str: ...
inspect_ai/scorer/_score.py
@@ -23,6 +23,8 @@ async def score(state: TaskState) -> list[Score]:
        a task that does not have a scorer.
 
     """
+    from inspect_ai.log._transcript import ScoreEvent, transcript
+
     scorers = _scorers.get(None)
     target = _target.get(None)
     if scorers is None or target is None:
@@ -30,7 +32,15 @@ async def score(state: TaskState) -> list[Score]:
             "The score() function can only be called while executing a task with a scorer."
         )
 
-    return [await scorer(state, target) for scorer in scorers]
+    scores: list[Score] = []
+    for scorer in scorers:
+        score = await scorer(state, target)
+        scores.append(score)
+        transcript()._event(
+            ScoreEvent(score=score, target=target.target, intermediate=True)
+        )
+
+    return scores
 
 
 def init_scoring_context(scorers: list[Scorer], target: Target) -> None:
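
With this change, each call to `score()` also emits a `ScoreEvent` marked `intermediate=True` to the transcript. A hedged sketch of calling it from a solver, assuming `score` is exported from `inspect_ai.scorer`; the solver itself is hypothetical:

```python
from inspect_ai.scorer import score
from inspect_ai.solver import Generate, Solver, TaskState, solver

@solver
def generate_and_check() -> Solver:
    async def solve(state: TaskState, generate: Generate) -> TaskState:
        state = await generate(state)
        # only valid while a task with a scorer is running; each intermediate
        # score is now also recorded to the transcript as a ScoreEvent
        intermediate = await score(state)
        state.metadata["intermediate_scores"] = [s.value for s in intermediate]
        return state

    return solve
```
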
inspect_ai/scorer/_scorer.py
@@ -1,3 +1,5 @@
+from copy import deepcopy
+from dataclasses import dataclass, field
 from functools import wraps
 from typing import (
     Any,
@@ -9,38 +11,74 @@ from typing import (
 )
 
 from inspect_ai._util._async import is_callable_coroutine
+from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.registry import (
     RegistryInfo,
+    is_registry_object,
     registry_add,
     registry_create,
     registry_info,
     registry_name,
+    registry_params,
     registry_tag,
     registry_unqualified_name,
 )
 from inspect_ai.solver._task_state import TaskState
 
-from ._metric import Metric, Score
+from ._metric import Metric, MetricSpec, Score, as_metric_spec
 from ._target import Target
 
 
 @runtime_checkable
 class Scorer(Protocol):
-    r"""Score model outputs.
-
-    Evaluate the passed outputs and targets and return a
-    dictionary with scoring outcomes and context.
-
-    Args:
-        state (TaskState): Task state
-        target (Target): Ideal target for the output.
-    """
-
     async def __call__(
         self,
         state: TaskState,
         target: Target,
-    ) -> Score: ...
+    ) -> Score:
+        r"""Score model outputs.
+
+        Evaluate the passed outputs and targets and return a
+        dictionary with scoring outcomes and context.
+
+        Args:
+            state: Task state
+            target: Ideal target for the output.
+
+        Examples:
+            ```python
+            @scorer
+            def custom_scorer() -> Scorer:
+                async def score(state: TaskState, target: Target) -> Score:
+                    # Compare state / model output with target
+                    # to yield a score
+                    return Score(value=...)
+
+                return score
+            ````
+        """
+        ...
+
+
+@dataclass(frozen=True)
+class ScorerSpec:
+    """Scorer specification used to (re-)create scorers."""
+
+    scorer: str
+    """Scorer name"""
+
+    args: dict[str, Any] = field(default_factory=dict)
+    """Scorer arguments."""
+
+    metadata: dict[str, Any] | None = field(default=None)
+    """Scorer metadata"""
+
+    metrics: (
+        list[MetricSpec | dict[str, list[MetricSpec]]]
+        | dict[str, list[MetricSpec]]
+        | None
+    ) = field(default=None)
+    """Scorer metrics"""
 
 
 P = ParamSpec("P")
@@ -90,17 +128,28 @@ def scorer(
     r"""Decorator for registering scorers.
 
     Args:
-        metrics (list[Metric] | dict[str, list[Metric]]): One or more metrics to calculate
+        metrics: One or more metrics to calculate
           over the scores.
-        name (str | None):
-          Optional name for scorer. If the decorator has no name
+        name: Optional name for scorer. If the decorator has no name
          argument then the name of the underlying ScorerType
          object will be used to automatically assign a name.
-        **metadata (dict[str,Any]): Additional values to serialize
+        **metadata: Additional values to serialize
          in metadata.
 
     Returns:
        Scorer with registry attributes.
+
+    Examples:
+        ```python
+        @scorer
+        def custom_scorer() -> Scorer:
+            async def score(state: TaskState, target: Target) -> Score:
+                # Compare state / model output with target
+                # to yield a score
+                return Score(value=...)
+
+            return score
+        ````
     """
 
     def wrapper(scorer_type: Callable[P, Scorer]) -> Callable[P, Scorer]:
@@ -142,6 +191,51 @@ def scorer(
     return wrapper
 
 
+def as_scorer_spec(scorer: Scorer) -> ScorerSpec:
+    if not is_registry_object(scorer):
+        raise PrerequisiteError(
+            f"The scorer {getattr(scorer, '__name__', '<unknown>')} was not created by a function decorated with @scorer so cannot be recorded."
+        )
+    name = registry_unqualified_name(scorer)
+    metrics = scorer_metrics(scorer)
+    resolved_metrics = resolve_metrics(metrics)
+
+    args = registry_params(scorer)
+    metadata = deepcopy(registry_info(scorer).metadata)
+    del metadata[SCORER_METRICS]
+
+    return ScorerSpec(
+        scorer=name, args=args, metadata=metadata, metrics=resolved_metrics
+    )
+
+
+def resolve_metrics(
+    metrics: list[Metric | dict[str, list[Metric]]] | dict[str, list[Metric]],
+) -> (
+    list[MetricSpec | dict[str, list[MetricSpec]]] | dict[str, list[MetricSpec]] | None
+):
+    if isinstance(metrics, list):
+        resolved_metrics: list[MetricSpec | dict[str, list[MetricSpec]]] = []
+        for metric_item in metrics:
+            if isinstance(metric_item, Metric):
+                resolved_metrics.append(as_metric_spec(metric_item))
+            else:
+                resolved_metrics.append(
+                    {
+                        metric_group: [
+                            as_metric_spec(metric) for metric in metrics_list
+                        ]
+                        for metric_group, metrics_list in metric_item.items()
+                    }
+                )
+        return resolved_metrics
+    else:
+        return {
+            metric_group: [as_metric_spec(metric) for metric in metrics_list]
+            for metric_group, metrics_list in metrics.items()
+        }
+
+
 def scorer_metrics(
     scorer: Scorer,
 ) -> list[Metric | dict[str, list[Metric]]] | dict[str, list[Metric]]:
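
The new `ScorerSpec` / `as_scorer_spec` machinery records how a registered scorer and its metrics were constructed. For orientation, a hedged sketch of the kind of decorated scorer this operates on (standard `@scorer` usage; the comparison logic is illustrative):

```python
from inspect_ai.scorer import Score, Scorer, Target, accuracy, scorer, stderr
from inspect_ai.solver import TaskState

@scorer(metrics=[accuracy(), stderr()])
def exact_match() -> Scorer:
    async def score(state: TaskState, target: Target) -> Score:
        # compare the model completion against the sample target
        answer = state.output.completion.strip()
        return Score(
            value=1.0 if answer == target.text else 0.0,
            answer=answer,
        )

    return score
```

Because the decorator stores metrics and arguments in the registry, `as_scorer_spec()` can presumably recover the scorer name, its args, and resolved `MetricSpec`s for recording in the log.
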
inspect_ai/solver/__init__.py
@@ -1,7 +1,7 @@
 from inspect_ai._util.deprecation import relocated_module_attribute
 
 from ._basic_agent import basic_agent
-from ._bridge import bridge
+from ._bridge.bridge import bridge
 from ._chain import chain
 from ._critique import self_critique
 from ._fork import fork
inspect_ai/solver/_basic_agent.py
@@ -81,31 +81,28 @@ def basic_agent(
        alternate conversion scheme as required via `score_value`.
 
     Args:
-        init: (Solver | list[Solver] | None): Agent initialisation
-          (defaults to system_message with basic ReAct prompt)
-        tools (list[Tool | ToolDef] | Solver | None): Tools available for the agent. Either a
-          list of tools or a Solver that can yield dynamic tools per-sample.
-        cache: (bool | CachePolicy): Caching behaviour for generate responses
-          (defaults to no caching).
-        max_attempts (int): Maximum number of submissions to accept before terminating.
-        message_limit (int | None): Limit on messages in sample before terminating agent.
+        init: Agent initialisation (defaults to system_message with basic ReAct prompt)
+        tools: Tools available for the agent. Either a list of tools or a Solver that
+          can yield dynamic tools per-sample.
+        cache: Caching behaviour for generate responses (defaults to no caching).
+        max_attempts: Maximum number of submissions to accept before terminating.
+        message_limit: Limit on messages in sample before terminating agent.
          If not specified, will use limit_messages defined for the task. If there is none
          defined for the task, 50 will be used as a default.
-        token_limit (int | None): Limit on tokens used in sample before terminating agent.
-        max_tool_output (int | None): Maximum output length (in bytes).
+        token_limit: Limit on tokens used in sample before terminating agent.
+        max_tool_output: Maximum output length (in bytes).
          Defaults to max_tool_output from active GenerateConfig.
-        score_value (ValueToFloat): Function used to extract float from scores (defaults
-          to standard value_to_float())
-        incorrect_message (str | Callable[[TaskState, list[Score]], str | Awaitable[str]]):
-          User message reply for an incorrect submission from the model. Alternatively,
-          a function which returns a message (function may optionally be async)
-        continue_message (str): User message to urge the model to continue when it
-          doesn't make a tool call.
-        submit_name (str): Name for tool used to make submissions
-          (defaults to 'submit')
-        submit_description (str): Description of submit tool (defaults to
-          'Submit an answer for evaluation')
-        **kwargs (Any): Deprecated arguments for backward compatibility.
+        score_value: Function used to extract float from scores (defaults
+          to standard value_to_float())
+        incorrect_message: User message reply for an incorrect submission from the model.
+          Alternatively, a function which returns a message (function may optionally be async)
+        continue_message: User message to urge the model to continue when it
+          doesn't make a tool call.
+        submit_name: Name for tool used to make submissions
+          (defaults to 'submit')
+        submit_description: Description of submit tool (defaults to
+          'Submit an answer for evaluation')
+        **kwargs: Deprecated arguments for backward compatibility.
 
     Returns:
        Plan for agent.
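
A hedged sketch of how these arguments are typically passed, using the standard `Task`/`basic_agent`/`bash` APIs; the dataset, prompts, and limits below are placeholders:

```python
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import includes
from inspect_ai.solver import basic_agent, system_message
from inspect_ai.tool import bash

@task
def demo_ctf() -> Task:
    return Task(
        dataset=[Sample(input="Find the flag in /challenge", target="flag{...}")],
        solver=basic_agent(
            init=system_message("You are a capable security researcher."),
            tools=[bash(timeout=180)],   # tools can also be a Solver for per-sample tools
            max_attempts=3,
            message_limit=30,
        ),
        scorer=includes(),
        sandbox="docker",
    )
```
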
inspect_ai/solver/_bridge/__init__.py
@@ -1,3 +0,0 @@
-from .bridge import bridge
-
-__all__ = ["bridge"]
inspect_ai/solver/_bridge/bridge.py
@@ -17,7 +17,7 @@ from .._task_state import TaskState
 def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solver:
     """Bridge an external agent into an Inspect Solver.
 
-    See documentation at https://inspect.ai-safety-institute.org.uk/agent-bridge.html
+    See documentation at <https://inspect.ai-safety-institute.org.uk/agent-bridge.html>
 
     Args:
       agent: Callable which takes a sample `dict` and returns a result `dict`.
@@ -63,11 +63,11 @@ def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solv
             else state.input
         )
 
-        # create sample
+        # create sample (use standard gpt-4 message encoding -- i.e. no 'developer' messages)
         sample = BridgeSample(
            sample_id=str(state.sample_id),
            epoch=state.epoch,
-           input=await openai_chat_messages(input, state.model.name),
+           input=await openai_chat_messages(input, model="gpt-4"),
            metadata=state.metadata,
            target=list(state.target),
        )
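
A hedged sketch of the bridge contract described in the docstring: the external agent receives a sample `dict` whose `input` holds OpenAI-format chat messages (now encoded with the gpt-4 scheme per the change above) and returns a result `dict`. The `input` key appears in the hunk; the `output` key in the result is an assumption for illustration only:

```python
from inspect_ai.solver import Solver, bridge

async def my_agent(sample: dict) -> dict:
    # sample["input"] holds OpenAI-format chat messages (see the hunk above);
    # returning {"output": ...} is an assumed result shape, not confirmed by this diff
    last_user = sample["input"][-1]["content"]
    return {"output": f"Echoing the task: {last_user}"}

def my_solver() -> Solver:
    return bridge(my_agent)
```
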
inspect_ai/solver/_chain.py
@@ -15,8 +15,7 @@ def chain(*solvers: Solver | list[Solver]) -> Solver:
        early.
 
     Args:
-        solvers (*Solver | list[Solver]): One or more solvers
-          or lists of solvers to chain together.
+        *solvers: One or more solvers or lists of solvers to chain together.
 
     Returns:
        Solver that executes the passed solvers as a chain.
inspect_ai/solver/_critique.py
@@ -25,15 +25,15 @@ def self_critique(
        need to use the model being evaluated).
 
     Args:
-        critique_template (str | None): String or path to file
+        critique_template: String or path to file
          containing critique template. The template uses two
          variables: `question` and `completion`.
          Variables from sample `metadata` are also available
          in the template.
-        completion_template (str | None): String or path to file
+        completion_template: String or path to file
          containing completion template. The template uses
          three variables: `question`, `completion`, and `critique`
-        model (str | Model | None): Alternate model to be used
+        model: Alternate model to be used
          for critique (by default the model being evaluated
          is used).
     """
inspect_ai/solver/_fork.py
@@ -32,8 +32,8 @@ async def fork(
        Store that doesn't affect the Store of other subtasks or the parent).
 
     Args:
-        state (TaskState): Beginning TaskState
-        solvers (Solver | list[Solver]): Solvers to apply on the TaskState.
+        state: Beginning TaskState
+        solvers: Solvers to apply on the TaskState.
          Each Solver will get a standalone copy of the TaskState.
 
     Returns:
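
For context, a hedged sketch of forking a `TaskState` across two candidate solver chains (called from inside some enclosing solver; the prompting strategies are illustrative):

```python
from inspect_ai.solver import TaskState, chain, fork, generate, prompt_template

async def explore(state: TaskState) -> list[TaskState]:
    # each chain receives an isolated copy of the state (and of the Store)
    return await fork(
        state,
        [
            chain(prompt_template("{prompt}\n\nAnswer concisely."), generate()),
            chain(prompt_template("{prompt}\n\nThink step by step."), generate()),
        ],
    )
```
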
inspect_ai/solver/_human_agent/__init__.py (file without changes)
inspect_ai/solver/_human_agent/agent.py
@@ -30,14 +30,11 @@ def human_agent(
        using a VS Code Window or Terminal.
 
     Args:
-        answer (bool | str): Is an explicit answer required for this
-          task or is it scored based on files in the container? Pass a
-          `str` with a regex to validate that the answer matches
-          the expected format.
-        intermediate_scoring (bool): Allow the human agent to
-          check their score while working.
-        record_session (bool): Record all user commands and outputs in
-          the sandbox bash session.
+        answer: Is an explicit answer required for this task or is it scored
+          based on files in the container? Pass a `str` with a regex to validate
+          that the answer matches the expected format.
+        intermediate_scoring: Allow the human agent to check their score while working.
+        record_session: Record all user commands and outputs in the sandbox bash session.
 
     Returns:
        Solver: Human agent solver.
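
A hedged sketch of configuring the human agent per the reworded docstring (the regex is a placeholder):

```python
from inspect_ai.solver import human_agent

solver = human_agent(
    answer=r"^flag\{.+\}$",      # require an answer matching this pattern
    intermediate_scoring=True,   # let the human check their score while working
    record_session=True,         # record the sandbox bash session
)
```
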
inspect_ai/solver/_human_agent/commands/clock.py
@@ -27,14 +27,10 @@ class StartCommand(HumanAgentCommand):
         print(call_human_agent("start"))
 
     def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
-        from inspect_ai.log._transcript import transcript
-
         async def start() -> str:
             if not state.running:
                 state.running = True
-                transcript().info(
-                    f"Task started (total time: {format_progress_time(state.time)})"
-                )
+                clock_action_event("start", state)
             return render_status(state)
 
         return start
@@ -57,14 +53,22 @@ class StopCommand(HumanAgentCommand):
         print(call_human_agent("stop"))
 
     def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
-        from inspect_ai.log._transcript import transcript
-
         async def stop() -> str:
             if state.running:
                 state.running = False
-                transcript().info(
-                    f"Task stopped (total time: {format_progress_time(state.time)})"
-                )
+                clock_action_event("stop", state)
             return render_status(state)
 
         return stop
+
+
+def clock_action_event(action: str, state: HumanAgentState) -> None:
+    from inspect_ai.log._transcript import transcript
+
+    transcript().info(
+        {
+            "action": action,
+            "total_time": format_progress_time(state.time, False),
+        },
+        source="human_agent",
+    )
inspect_ai/solver/_human_agent/commands/note.py
@@ -37,6 +37,6 @@ class NoteCommand(HumanAgentCommand):
         from inspect_ai.log._transcript import transcript
 
         async def note(content: str) -> None:
-            transcript().info(content)
+            transcript().info(content, source="human_agent")
 
         return note
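
The clock and note commands now tag their transcript entries with `source="human_agent"`. Mirroring that call shape, a hedged sketch of writing a structured info event from custom code; the payload and source name are illustrative, and the private `inspect_ai.log._transcript` import follows the diff rather than a documented public API:

```python
from inspect_ai.log._transcript import transcript

# record a structured info event attributed to a named source
transcript().info(
    {"action": "checkpoint", "elapsed": "00:12:30"},
    source="my_agent",
)
```
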
inspect_ai/solver/_human_agent/commands/score.py
@@ -1,6 +1,5 @@
 from argparse import Namespace
 from copy import deepcopy
-from textwrap import dedent
 from typing import Awaitable, Callable, Literal
 
 from pydantic import JsonValue
@@ -51,8 +50,6 @@ class ScoreCommand(HumanAgentCommand):
 
     def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
         async def score_task(answer: str | None) -> str:
-            from inspect_ai.log._transcript import transcript
-
             # make a copy of TaskState, add the answer, then score
             if answer:
                 task_state = deepcopy(self._state)
@@ -64,14 +61,6 @@ class ScoreCommand(HumanAgentCommand):
             # record the scoring action in our state
             state.scorings.append(IntermediateScoring(time=state.time, scores=result))
 
-            # record to transcript
-            transcript().info(
-                dedent(f"""
-                ### Intermediate Score
-                **Answer:** {result[0].answer}, **Score:** {result[0].as_str()}
-                """)
-            )
-
             # notify user
             return render_text(
                 f"[bold]Answer:[/bold] {result[0].answer}, [bold]Score:[/bold] {result[0].as_str()}"
inspect_ai/solver/_multiple_choice.py
@@ -219,38 +219,35 @@ def multiple_choice(
     multiple_correct: bool = False,
     **kwargs: Unpack[DeprecatedArgs],
 ) -> Solver:
-    """Multiple choice question solver.
-
-    Formats a multiple choice question prompt, then calls `generate()`
-
-    ### Usage
+    """Multiple choice question solver. Formats a multiple choice question prompt, then calls `generate()`.
 
     Note that due to the way this solver works, it has some constraints:
 
-      1. The `Sample` must have the `choices` attribute set.
-      2. The only built-in compatible scorer is the `choice` scorer.
-      3. It calls `generate()` internally, so you don't need to call it again
-
-    ### Shuffling
-
-    You can shuffle choices when you load your dataset by using the `shuffle_choices` method or parameter of the datasets API.
+    1. The `Sample` must have the `choices` attribute set.
+    2. The only built-in compatible scorer is the `choice` scorer.
+    3. It calls `generate()` internally, so you don't need to call it again
 
     Args:
-      template (str | None): Template to use for the multiple choice question.
+      template: Template to use for the multiple choice question.
         The defaults vary based on the options and are taken from the `MultipleChoiceTemplate` enum. The template will have questions and possible answers substituted into it before being sent to the model. Consequently it requires three specific template variables:
-          - `{question}`: The question to be asked.
-          - `{choices}`: The choices available, which will be formatted as a
+
+        - `{question}`: The question to be asked.
+        - `{choices}`: The choices available, which will be formatted as a
            list of A) ... B) ... etc. before sending to the model.
-          - `{letters}`: (optional) A string of letters representing the choices, e.g.
+        - `{letters}`: (optional) A string of letters representing the choices, e.g.
            "A,B,C". Used to be explicit to the model about the possible answers.
-      cot (bool): Default `False`. Whether the solver should perform chain-of-thought
+      cot: Default `False`. Whether the solver should perform chain-of-thought
         reasoning before answering. NOTE: this has no effect if you provide a custom template.
-      multiple_correct (bool): Default `False`. Whether to allow multiple
+      multiple_correct: Default `False`. Whether to allow multiple
        answers to the multiple choice question. For example, "What numbers are
        squares? A) 3, B) 4, C) 9" has multiple correct answers, B and C. Leave
        as `False` if there's exactly one correct answer from the choices
       available. NOTE: this has no effect if you provide a custom template.
      **kwargs (Any): Deprecated arguments for backward compatibility.
+
+    #### Shuffling
+
+    You can shuffle choices when you load your dataset by using the `shuffle_choices` method or parameter of the datasets API.
     """
     shuffle: bool | Random = False
     if "shuffle" in kwargs:
inspect_ai/solver/_prompt.py
@@ -20,8 +20,8 @@ def prompt_template(template: str, **params: Any) -> Solver:
     `params`.
 
     Args:
-      template: (str): Template for prompt.
-      **params (dict[str,Any]): Parameters to fill into the template.
+      template: Template for prompt.
+      **params: Parameters to fill into the template.
 
     Returns:
       A solver that uses the specified prompt template.
@@ -51,8 +51,8 @@ def system_message(template: str, **params: Any) -> Solver:
     are none it will be inserted at the beginning of the conversation).
 
     Args:
-      template (str): Template for system message.
-      **params (dict[str,Any]): Parameters to fill into the template.
+      template: Template for system message.
+      **params: Parameters to fill into the template.
 
     Returns:
       A solver that inserts the parameterised system message.
@@ -80,8 +80,8 @@ def user_message(template: str, **params: Any) -> Solver:
     included in the `params`.
 
     Args:
-      template (str): Template for user message.
-      **params (dict[str,Any]): Parameters to fill into the template.
+      template: Template for user message.
+      **params: Parameters to fill into the template.
 
     Returns:
       A solver that inserts the parameterised user message.
@@ -109,7 +109,7 @@ def chain_of_thought(template: str = DEFAULT_COT_TEMPLATE) -> Solver:
     """Solver which modifies the user prompt to encourage chain of thought.
 
     Args:
-      template (str): String or path to file containing CoT template.
+      template: String or path to file containing CoT template.
        The template uses a single variable: `prompt`.
     """
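
A hedged sketch combining the templating solvers documented above into a simple chain; the template text is illustrative, and both templates substitute the documented variables (`{prompt}` plus any `params`):

```python
from inspect_ai.solver import chain, generate, prompt_template, system_message

cot_pipeline = chain(
    system_message("You are a meticulous {role}.", role="math tutor"),
    prompt_template("{prompt}\n\nShow your reasoning before the final answer."),
    generate(),
)
```
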