inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/sandbox.py +4 -1
  8. inspect_ai/_cli/score.py +181 -32
  9. inspect_ai/_cli/trace.py +2 -0
  10. inspect_ai/_cli/view.py +4 -2
  11. inspect_ai/_display/core/config.py +7 -1
  12. inspect_ai/_display/core/progress.py +1 -1
  13. inspect_ai/_display/textual/app.py +8 -4
  14. inspect_ai/_display/textual/widgets/samples.py +6 -5
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/__init__.py +0 -0
  17. inspect_ai/_eval/eval.py +100 -97
  18. inspect_ai/_eval/evalset.py +69 -69
  19. inspect_ai/_eval/loader.py +122 -12
  20. inspect_ai/_eval/registry.py +1 -1
  21. inspect_ai/_eval/run.py +14 -0
  22. inspect_ai/_eval/score.py +125 -36
  23. inspect_ai/_eval/task/log.py +105 -4
  24. inspect_ai/_eval/task/results.py +92 -38
  25. inspect_ai/_eval/task/run.py +6 -2
  26. inspect_ai/_eval/task/sandbox.py +35 -2
  27. inspect_ai/_eval/task/task.py +49 -46
  28. inspect_ai/_util/__init__.py +0 -0
  29. inspect_ai/_util/constants.py +1 -1
  30. inspect_ai/_util/content.py +8 -0
  31. inspect_ai/_util/error.py +2 -0
  32. inspect_ai/_util/file.py +15 -1
  33. inspect_ai/_util/logger.py +4 -2
  34. inspect_ai/_util/registry.py +7 -1
  35. inspect_ai/_view/view.py +1 -2
  36. inspect_ai/_view/www/App.css +8 -3
  37. inspect_ai/_view/www/README.md +1 -1
  38. inspect_ai/_view/www/dist/assets/index.css +66 -38
  39. inspect_ai/_view/www/dist/assets/index.js +525 -523
  40. inspect_ai/_view/www/log-schema.json +86 -73
  41. inspect_ai/_view/www/package.json +1 -1
  42. inspect_ai/_view/www/src/App.tsx +1 -0
  43. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
  44. inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
  45. inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
  46. inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
  47. inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
  48. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
  49. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
  50. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
  51. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
  52. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
  53. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
  54. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
  55. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
  56. inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
  57. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
  58. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
  59. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
  60. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
  64. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
  65. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
  66. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
  67. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
  68. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
  69. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
  70. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
  72. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
  73. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
  74. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
  75. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
  76. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
  77. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
  78. inspect_ai/_view/www/src/types/log.d.ts +107 -19
  79. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
  80. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
  81. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
  82. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
  83. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
  84. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
  85. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
  86. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
  87. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
  88. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
  89. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  90. inspect_ai/approval/_approval.py +2 -0
  91. inspect_ai/approval/_approver.py +4 -4
  92. inspect_ai/approval/_auto.py +1 -1
  93. inspect_ai/approval/_human/approver.py +3 -0
  94. inspect_ai/approval/_policy.py +5 -0
  95. inspect_ai/approval/_registry.py +2 -2
  96. inspect_ai/dataset/_dataset.py +36 -45
  97. inspect_ai/dataset/_sources/__init__.py +0 -0
  98. inspect_ai/dataset/_sources/csv.py +13 -13
  99. inspect_ai/dataset/_sources/hf.py +29 -29
  100. inspect_ai/dataset/_sources/json.py +10 -10
  101. inspect_ai/log/__init__.py +2 -0
  102. inspect_ai/log/_convert.py +3 -3
  103. inspect_ai/log/_file.py +24 -9
  104. inspect_ai/log/_log.py +98 -7
  105. inspect_ai/log/_message.py +3 -1
  106. inspect_ai/log/_recorders/file.py +4 -0
  107. inspect_ai/log/_recorders/recorder.py +3 -0
  108. inspect_ai/log/_transcript.py +19 -8
  109. inspect_ai/model/__init__.py +2 -0
  110. inspect_ai/model/_cache.py +39 -21
  111. inspect_ai/model/_call_tools.py +2 -2
  112. inspect_ai/model/_chat_message.py +14 -4
  113. inspect_ai/model/_generate_config.py +1 -1
  114. inspect_ai/model/_model.py +31 -24
  115. inspect_ai/model/_model_output.py +14 -1
  116. inspect_ai/model/_openai.py +10 -18
  117. inspect_ai/model/_providers/google.py +9 -5
  118. inspect_ai/model/_providers/openai.py +5 -9
  119. inspect_ai/model/_providers/openrouter.py +1 -1
  120. inspect_ai/scorer/__init__.py +6 -1
  121. inspect_ai/scorer/_answer.py +1 -1
  122. inspect_ai/scorer/_classification.py +4 -0
  123. inspect_ai/scorer/_match.py +4 -5
  124. inspect_ai/scorer/_metric.py +87 -28
  125. inspect_ai/scorer/_metrics/__init__.py +3 -3
  126. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  127. inspect_ai/scorer/_metrics/mean.py +3 -17
  128. inspect_ai/scorer/_metrics/std.py +111 -30
  129. inspect_ai/scorer/_model.py +12 -12
  130. inspect_ai/scorer/_pattern.py +3 -3
  131. inspect_ai/scorer/_reducer/reducer.py +36 -21
  132. inspect_ai/scorer/_reducer/registry.py +2 -2
  133. inspect_ai/scorer/_reducer/types.py +7 -1
  134. inspect_ai/scorer/_score.py +11 -1
  135. inspect_ai/scorer/_scorer.py +110 -16
  136. inspect_ai/solver/__init__.py +1 -1
  137. inspect_ai/solver/_basic_agent.py +19 -22
  138. inspect_ai/solver/_bridge/__init__.py +0 -3
  139. inspect_ai/solver/_bridge/bridge.py +3 -3
  140. inspect_ai/solver/_chain.py +1 -2
  141. inspect_ai/solver/_critique.py +3 -3
  142. inspect_ai/solver/_fork.py +2 -2
  143. inspect_ai/solver/_human_agent/__init__.py +0 -0
  144. inspect_ai/solver/_human_agent/agent.py +5 -8
  145. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  146. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  147. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  148. inspect_ai/solver/_multiple_choice.py +15 -18
  149. inspect_ai/solver/_prompt.py +7 -7
  150. inspect_ai/solver/_solver.py +53 -52
  151. inspect_ai/solver/_task_state.py +80 -69
  152. inspect_ai/solver/_use_tools.py +9 -9
  153. inspect_ai/tool/__init__.py +2 -1
  154. inspect_ai/tool/_tool.py +43 -14
  155. inspect_ai/tool/_tool_call.py +6 -2
  156. inspect_ai/tool/_tool_choice.py +3 -1
  157. inspect_ai/tool/_tool_def.py +10 -8
  158. inspect_ai/tool/_tool_params.py +24 -0
  159. inspect_ai/tool/_tool_with.py +7 -7
  160. inspect_ai/tool/_tools/__init__.py +0 -0
  161. inspect_ai/tool/_tools/_computer/_common.py +2 -2
  162. inspect_ai/tool/_tools/_computer/_computer.py +11 -0
  163. inspect_ai/tool/_tools/_execute.py +15 -9
  164. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  165. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  166. inspect_ai/tool/_tools/_web_search.py +7 -5
  167. inspect_ai/util/_concurrency.py +3 -3
  168. inspect_ai/util/_panel.py +2 -0
  169. inspect_ai/util/_resource.py +12 -12
  170. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  171. inspect_ai/util/_sandbox/docker/config.py +2 -1
  172. inspect_ai/util/_sandbox/docker/docker.py +10 -1
  173. inspect_ai/util/_sandbox/docker/service.py +100 -0
  174. inspect_ai/util/_sandbox/environment.py +99 -96
  175. inspect_ai/util/_subprocess.py +5 -3
  176. inspect_ai/util/_subtask.py +15 -16
  177. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
  178. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
  179. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
  180. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
  181. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
  182. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/task/run.py CHANGED
@@ -190,7 +190,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
     if task.setup:
        plan.steps = unroll(task.setup) + plan.steps

-    # resolve the scorer
+    # resolve the scorer
     score = score and task.scorer is not None
     scorers: list[Scorer] | None = task.scorer if (score and task.scorer) else None
     scorer_profiles = (
@@ -519,6 +519,7 @@ async def task_run_sample(
             key: SampleScore(
                 score=score,
                 sample_id=previous_sample.id,
+                sample_metadata=previous_sample.metadata,
             )
             for key, score in previous_sample.scores.items()
         }
@@ -696,6 +697,7 @@ async def task_run_sample(
                 sample_score = SampleScore(
                     score=score_result,
                     sample_id=sample.id,
+                    sample_metadata=sample.metadata,
                     scorer=registry_unqualified_name(scorer),
                 )
                 transcript()._event(
@@ -709,7 +711,9 @@ async def task_run_sample(
         if state.scores is not None:
             for name, score in state.scores.items():
                 results[name] = SampleScore(
-                    score=score, sample_id=state.sample_id
+                    score=score,
+                    sample_id=state.sample_id,
+                    sample_metadata=state.metadata,
                 )
                 transcript()._event(
                     ScoreEvent(score=score, target=sample.target)
inspect_ai/_eval/task/sandbox.py CHANGED
@@ -5,11 +5,20 @@ from random import random
 from typing import AsyncGenerator, Callable, NamedTuple, cast

 import httpx
+from tenacity import (
+    retry,
+    retry_if_exception,
+    stop_after_attempt,
+    stop_after_delay,
+    wait_exponential_jitter,
+)

 from inspect_ai._eval.task.task import Task
 from inspect_ai._eval.task.util import task_run_dir
+from inspect_ai._util.constants import DEFAULT_MAX_RETRIES, DEFAULT_TIMEOUT
 from inspect_ai._util.file import file, filesystem
 from inspect_ai._util.registry import registry_unqualified_name
+from inspect_ai._util.retry import httpx_should_retry, log_retry_attempt
 from inspect_ai._util.url import data_uri_to_base64, is_data_uri, is_http_url
 from inspect_ai.dataset import Sample
 from inspect_ai.util._concurrency import concurrency
@@ -115,8 +124,7 @@ async def read_sandboxenv_file(contents: str) -> bytes:
         contents_base64 = data_uri_to_base64(contents)
         file_bytes = base64.b64decode(contents_base64)
     elif is_http_url(contents):
-        client = httpx.AsyncClient()
-        file_bytes = (await client.get(contents, follow_redirects=True)).content
+        file_bytes = await _retrying_httpx_get(contents)
     else:
         # try to read as a file (if it doesn't exist or has a path not cool w/
         # the filesystem then we fall back to contents)
@@ -172,3 +180,28 @@ def resolve_sandbox(
         return sample.sandbox
     else:
         return None
+
+
+async def _retrying_httpx_get(
+    url: str,
+    client: httpx.AsyncClient = httpx.AsyncClient(),
+    timeout: int = 30,  # per-attempt timeout
+    max_retries: int = DEFAULT_MAX_RETRIES,
+    total_timeout: int = DEFAULT_TIMEOUT,  # timeout for the whole retry loop. not for an individual attempt
+) -> bytes:
+    @retry(
+        wait=wait_exponential_jitter(),
+        stop=(stop_after_attempt(max_retries) | stop_after_delay(total_timeout)),
+        retry=retry_if_exception(httpx_should_retry),
+        before_sleep=log_retry_attempt(url),
+    )
+    async def do_get() -> bytes:
+        response = await client.get(
+            url=url,
+            follow_redirects=True,
+            timeout=(timeout, timeout, timeout, timeout),
+        )
+        response.raise_for_status()
+        return response.content
+
+    return await do_get()
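The new `_retrying_httpx_get` wraps a single GET in a tenacity retry policy: exponential backoff with jitter between attempts, a stop condition that triggers on either an attempt count or a total elapsed deadline, and a predicate that retries only transient failures. Below is a minimal standalone sketch of the same pattern; the `should_retry` predicate is an illustrative assumption (inspect_ai's actual checks live in `inspect_ai._util.retry.httpx_should_retry`):

```python
# Sketch of the tenacity retry pattern used by _retrying_httpx_get (illustrative).
import httpx
from tenacity import (
    retry,
    retry_if_exception,
    stop_after_attempt,
    stop_after_delay,
    wait_exponential_jitter,
)


def should_retry(ex: BaseException) -> bool:
    # Assumed definition of "transient": connect/read failures and 429/5xx.
    if isinstance(ex, (httpx.ConnectError, httpx.ReadTimeout)):
        return True
    if isinstance(ex, httpx.HTTPStatusError):
        code = ex.response.status_code
        return code == 429 or code >= 500
    return False


@retry(
    wait=wait_exponential_jitter(),  # exponential backoff with jitter
    stop=stop_after_attempt(5) | stop_after_delay(120),  # whichever hits first
    retry=retry_if_exception(should_retry),
)
async def fetch_bytes(url: str) -> bytes:
    async with httpx.AsyncClient() as client:
        response = await client.get(url, follow_redirects=True, timeout=30)
        response.raise_for_status()  # raise on 4xx/5xx so tenacity can see it
        return response.content
```

Note that `raise_for_status()` is what turns HTTP error responses into exceptions the `retry_if_exception` predicate can inspect.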
inspect_ai/_eval/task/task.py CHANGED
@@ -39,38 +39,6 @@ class Task:
     r"""Evaluation task.

     Tasks are the basis for defining and running evaluations.
-
-    Args:
-        dataset (Dataset | Sequence[Sample]): Dataset to evaluate
-        setup: (Solver | list[Solver] | None): Setup step (always run
-            even when the main `solver` is replaced).
-        solver: (Solver | list[Solver]): Solver or list of solvers.
-            Defaults to generate(), a normal call to the model.
-        scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
-        metrics (list[Metric] | dict[str, list[Metric]] | None):
-            Alternative metrics (overrides the metrics provided by the specified scorer).
-        config (GenerateConfig): Model generation config.
-        sandbox (SandboxEnvironmentType | None): Sandbox environment type
-            (or optionally a str or tuple with a shorthand spec)
-        approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
-            Either a path to an approval policy config file or a list of approval policies.
-            Defaults to no approval policy.
-        epochs (int | Epochs | None): Epochs to repeat samples for and optional score
-            reducer function(s) used to combine sample scores (defaults to "mean")
-        fail_on_error (bool | float | None): `True` to fail on first sample error
-            (default); `False` to never fail on sample errors; Value between 0 and 1
-            to fail if a proportion of total samples fails. Value greater than 1 to fail
-            eval if a count of samples fails.
-        message_limit (int | None): Limit on total messages used for each sample.
-        token_limit (int | None): Limit on total tokens used for each sample.
-        time_limit (int | None): Limit on time (in seconds) for execution of each sample.
-        name: (str | None): Task name. If not specified is automatically
-            determined based on the name of the task directory (or "task")
-            if its anonymous task (e.g. created in a notebook and passed to
-            eval() directly)
-        version: (int): Version of task (to distinguish evolutions
-            of the task spec or breaking changes to it)
-        metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
     """

     def __init__(
@@ -93,6 +61,41 @@ class Task:
         metadata: dict[str, Any] | None = None,
         **kwargs: Unpack[TaskDeprecatedArgs],
     ) -> None:
+        """Create a task.
+
+        Args:
+            dataset (Dataset | Sequence[Sample]): Dataset to evaluate
+            setup: (Solver | list[Solver] | None): Setup step (always run
+                even when the main `solver` is replaced).
+            solver: (Solver | list[Solver]): Solver or list of solvers.
+                Defaults to generate(), a normal call to the model.
+            scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
+            metrics (list[Metric] | dict[str, list[Metric]] | None):
+                Alternative metrics (overrides the metrics provided by the specified scorer).
+            config (GenerateConfig): Model generation config.
+            sandbox (SandboxEnvironmentType | None): Sandbox environment type
+                (or optionally a str or tuple with a shorthand spec)
+            approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
+                Either a path to an approval policy config file or a list of approval policies.
+                Defaults to no approval policy.
+            epochs (int | Epochs | None): Epochs to repeat samples for and optional score
+                reducer function(s) used to combine sample scores (defaults to "mean")
+            fail_on_error (bool | float | None): `True` to fail on first sample error
+                (default); `False` to never fail on sample errors; Value between 0 and 1
+                to fail if a proportion of total samples fails. Value greater than 1 to fail
+                eval if a count of samples fails.
+            message_limit (int | None): Limit on total messages used for each sample.
+            token_limit (int | None): Limit on total tokens used for each sample.
+            time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+            name: (str | None): Task name. If not specified is automatically
+                determined based on the name of the task directory (or "task")
+                if its anonymous task (e.g. created in a notebook and passed to
+                eval() directly)
+            version: (int): Version of task (to distinguish evolutions
+                of the task spec or breaking changes to it)
+            metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
+            **kwargs: Deprecated arguments.
+        """
         # handle deprecated args
         for arg, value in kwargs.items():
             newarg = ""
@@ -179,33 +182,33 @@ def task_with(
         task (Task): Task to adapt (it is deep copied prior to mutating options)
         dataset (Dataset | Sequence[Sample]): Dataset to evaluate
         setup: (Solver | list[Solver] | None): Setup step (always run
-        even when the main `solver` is replaced).
+            even when the main `solver` is replaced).
         solver: (Solver | list[Solver]): Solver or list of solvers.
-        Defaults to generate(), a normal call to the model.
+            Defaults to generate(), a normal call to the model.
         scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
         metrics (list[Metric] | dict[str, list[Metric]] | None):
-        Alternative metrics (overrides the metrics provided by the specified scorer).
+            Alternative metrics (overrides the metrics provided by the specified scorer).
         config (GenerateConfig): Model generation config.
         sandbox (SandboxEnvironmentType | None): Sandbox environment type
-        (or optionally a str or tuple with a shorthand spec)
+            (or optionally a str or tuple with a shorthand spec)
         approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
-        Either a path to an approval policy config file or a list of approval policies.
-        Defaults to no approval policy.
+            Either a path to an approval policy config file or a list of approval policies.
+            Defaults to no approval policy.
         epochs (int | Epochs | None): Epochs to repeat samples for and optional score
-        reducer function(s) used to combine sample scores (defaults to "mean")
+            reducer function(s) used to combine sample scores (defaults to "mean")
         fail_on_error (bool | float | None): `True` to fail on first sample error
-        (default); `False` to never fail on sample errors; Value between 0 and 1
-        to fail if a proportion of total samples fails. Value greater than 1 to fail
-        eval if a count of samples fails.
+            (default); `False` to never fail on sample errors; Value between 0 and 1
+            to fail if a proportion of total samples fails. Value greater than 1 to fail
+            eval if a count of samples fails.
         message_limit (int | None): Limit on total messages used for each sample.
         token_limit (int | None): Limit on total tokens used for each sample.
         time_limit (int | None): Limit on time (in seconds) for execution of each sample.
         name: (str | None): Task name. If not specified is automatically
-        determined based on the name of the task directory (or "task")
-        if its anonymous task (e.g. created in a notebook and passed to
-        eval() directly)
+            determined based on the name of the task directory (or "task")
+            if its anonymous task (e.g. created in a notebook and passed to
+            eval() directly)
         version: (int): Version of task (to distinguish evolutions
-        of the task spec or breaking changes to it)
+            of the task spec or breaking changes to it)
         metadata: (dict[str, Any] | None): Additional metadata to associate with the task.

     Returns:
inspect_ai/_util/constants.py CHANGED
@@ -1,7 +1,7 @@
 from pathlib import Path
 from typing import Literal

-PKG_AUTHOR = "UK AI Safety Institute"
+PKG_AUTHOR = "UK AI Security Institute"
 PKG_AUTHOR_DIR = "UK-AISI"
 PKG_NAME = Path(__file__).parent.parent.stem
 PKG_PATH = Path(__file__).parent.parent
inspect_ai/_util/content.py CHANGED
@@ -4,6 +4,8 @@ from pydantic import BaseModel, Field


 class ContentText(BaseModel):
+    """Text content."""
+
     type: Literal["text"] = Field(default="text")
     """Type."""

@@ -12,6 +14,8 @@ class ContentText(BaseModel):


 class ContentImage(BaseModel):
+    """Image content."""
+
     type: Literal["image"] = Field(default="image")
     """Type."""

@@ -26,6 +30,8 @@ class ContentImage(BaseModel):


 class ContentAudio(BaseModel):
+    """Audio content."""
+
     type: Literal["audio"] = Field(default="audio")
     """Type."""

@@ -37,6 +43,8 @@ class ContentAudio(BaseModel):


 class ContentVideo(BaseModel):
+    """Video content."""
+
     type: Literal["video"] = Field(default="video")
     """Type."""

inspect_ai/_util/error.py CHANGED
@@ -9,6 +9,8 @@ from rich.console import RenderableType


 class EvalError(BaseModel):
+    """Eval error details."""
+
     message: str
     """Error message."""

inspect_ai/_util/file.py CHANGED
@@ -18,6 +18,7 @@ from fsspec.core import split_protocol  # type: ignore
 from fsspec.implementations.local import make_path_posix  # type: ignore
 from pydantic import BaseModel
 from s3fs import S3FileSystem  # type: ignore
+from shortuuid import uuid

 # https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/spec.html#AbstractFileSystem
 # https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.generic.GenericFileSystem
@@ -169,6 +170,9 @@ class FileSystem:
     def exists(self, path: str) -> bool:
         return self.fs.exists(path) is True

+    def touch(self, path: str) -> None:
+        self.fs.touch(path)
+
     def rm(
         self, path: str, recursive: bool = False, maxdepth: int | None = None
     ) -> None:
@@ -218,6 +222,16 @@ class FileSystem:
     def is_local(self) -> bool:
         return isinstance(self.fs, fsspec.implementations.local.LocalFileSystem)

+    def is_writeable(self, path: str) -> bool:
+        try:
+            path = path.rstrip("/\\")
+            touch_file = f"{path}{self.fs.sep}{uuid()}"
+            self.touch(touch_file)
+            self.rm(touch_file)
+            return True
+        except PermissionError:
+            return False
+
     def is_async(self) -> bool:
         return isinstance(self.fs, fsspec.asyn.AsyncFileSystem)

@@ -354,7 +368,7 @@ def safe_filename(s: str, max_length: int = 255) -> str:
     Returns:
         str: A safe filename string

-    Example:
+    Examples:
         >>> safe_filename("Hello/World?.txt")
         'Hello_World.txt'
     """
inspect_ai/_util/logger.py CHANGED
@@ -161,7 +161,7 @@ def init_logger(
     getLogger().addHandler(_logHandler)

     # establish default capture level
-    capture_level = min(TRACE, levelno)
+    capture_level = min(TRACE, levelno, transcript_levelno)

     # see all the messages (we won't actually display/write all of them)
     getLogger().setLevel(capture_level)
@@ -181,7 +181,9 @@ def notify_logger_record(record: LogRecord, write: bool) -> None:
     from inspect_ai.log._transcript import LoggerEvent, transcript

     if write:
-        transcript()._event(LoggerEvent(message=LoggingMessage.from_log_record(record)))
+        transcript()._event(
+            LoggerEvent(message=LoggingMessage._from_log_record(record))
+        )
     global _rate_limit_count
     if (record.levelno <= INFO and re.search(r"\b429\b", record.getMessage())) or (
         record.levelno == DEBUG
inspect_ai/_util/registry.py CHANGED
@@ -209,7 +209,13 @@ def registry_create(type: RegistryType, name: str, **kwargs: Any) -> object:
     if isclass(obj):
         return with_registry_info(obj(**kwargs))
     elif callable(obj):
-        return_type = getattr(get_annotations(obj)["return"], "__name__", None)
+        return_type = get_annotations(obj).get("return")
+        # Until we remove the MetricDeprecated symbol we need this extra
+        # bit to map the Metric union back to Metric
+        if "_metric.Metric" in str(return_type):
+            return_type = "Metric"
+        else:
+            return_type = getattr(return_type, "__name__", None)
         if return_type and return_type.lower() == type:
            return with_registry_info(obj(**kwargs))
         else:
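The `registry_create` change switches from indexing `get_annotations(obj)["return"]` to `.get("return")`, so callables without a return annotation yield `None` instead of raising `KeyError`. A small illustration of the underlying mechanism (the `Metric` class below is a stand-in, not inspect_ai's):

```python
# How return-annotation dispatch degrades gracefully with .get() (sketch).
from inspect import get_annotations


class Metric: ...


def accuracy() -> Metric:
    return Metric()


def unannotated():
    return Metric()


ann = get_annotations(accuracy).get("return")      # <class 'Metric'>
print(getattr(ann, "__name__", None))              # 'Metric'
print(get_annotations(unannotated).get("return"))  # None, not a KeyError
```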
inspect_ai/_view/view.py CHANGED
@@ -28,11 +28,10 @@ def view(
     port: int = DEFAULT_VIEW_PORT,
     authorization: str | None = None,
     log_level: str | None = None,
-    log_level_transcript: str | None = None,
     fs_options: dict[str, Any] = {},
 ) -> None:
     init_dotenv()
-    init_logger(log_level, log_level_transcript)
+    init_logger(log_level)

     # initialize the log_dir
     log_dir = log_dir if log_dir else os.getenv("INSPECT_LOG_DIR", "./logs")
inspect_ai/_view/www/App.css CHANGED
@@ -25,6 +25,7 @@
   /* Inspect Font Sizes */
   --inspect-font-size-title: 1.5rem;
   --inspect-font-size-title-secondary: 1.3rem;
+  --inspect-font-size-largest: 1.2rem;
   --inspect-font-size-larger: 1.1rem;
   --inspect-font-size-large: 1rem;
   --inspect-font-size-base: 0.9rem;
@@ -64,15 +65,15 @@ body[class^="vscode-"] .app-main-grid {

 /* Inspect Text Styles */
 .text-style-label {
-  text-transform: uppercase;
+  text-transform: uppercase !important;
 }

 .text-style-secondary {
-  color: var(--bs-secondary);
+  color: var(--bs-secondary) !important;
 }

 .text-style-tertiary {
-  color: var(--bs-tertiary-color);
+  color: var(--bs-tertiary-color) !important;
 }

 /* Inspect Font Size Styles */
@@ -84,6 +85,10 @@ body[class^="vscode-"] .app-main-grid {
   font-size: var(--inspect-font-size-title-secondary);
 }

+.text-size-largest {
+  font-size: var(--inspect-font-size-largest);
+}
+
 .text-size-larger {
   font-size: var(--inspect-font-size-larger);
 }
inspect_ai/_view/www/README.md CHANGED
@@ -19,7 +19,7 @@ Use the following commands (run in the `src/inspect_ai/_view/www` dir) to ensure
    yarn prettier:write
    ```

-3. Build the bundled output to `dist`
+3. Build the bundled output into the `dist` directory.

    ```bash
    yarn build
inspect_ai/_view/www/dist/assets/index.css CHANGED
@@ -14298,6 +14298,7 @@ pre[class*="language-"] {
   /* Inspect Font Sizes */
   --inspect-font-size-title: 1.5rem;
   --inspect-font-size-title-secondary: 1.3rem;
+  --inspect-font-size-largest: 1.2rem;
   --inspect-font-size-larger: 1.1rem;
   --inspect-font-size-large: 1rem;
   --inspect-font-size-base: 0.9rem;
@@ -14337,15 +14338,15 @@ body[class^="vscode-"] .app-main-grid {

 /* Inspect Text Styles */
 .text-style-label {
-  text-transform: uppercase;
+  text-transform: uppercase !important;
 }

 .text-style-secondary {
-  color: var(--bs-secondary);
+  color: var(--bs-secondary) !important;
 }

 .text-style-tertiary {
-  color: var(--bs-tertiary-color);
+  color: var(--bs-tertiary-color) !important;
 }

 /* Inspect Font Size Styles */
@@ -14357,6 +14358,10 @@ body[class^="vscode-"] .app-main-grid {
   font-size: var(--inspect-font-size-title-secondary);
 }

+.text-size-largest {
+  font-size: var(--inspect-font-size-largest);
+}
+
 .text-size-larger {
   font-size: var(--inspect-font-size-larger);
 }
@@ -16195,58 +16200,58 @@ ul.jsondiffpatch-textdiff {
   grid-template-columns: max-content max-content;
   column-gap: 1em;
 }
-._container_43lfg_1 {
+._container_1jqar_1 {
   margin-top: 0.5em;
   padding-left: 0;
 }

-._label_43lfg_6 {
-  padding-right: 2em;
-  padding-left: 0;
-  padding-bottom: 0;
+._label_1jqar_6 {
+  padding-right: 2em !important;
+  padding-left: 0 !important;
+  padding-bottom: 0 !important;
   font-weight: 400;
-  padding-bottom: 0;
+  padding-bottom: 0 !important;
 }

-._wordBreak_43lfg_14 {
+._wordBreak_1jqar_14 {
   word-break: break-all;
 }

-._scoreTable_43lfg_18 {
+._scoreTable_1jqar_18 {
   width: 100%;
   margin-bottom: 1em;
 }

-._bottomBorder_43lfg_23 {
+._bottomBorder_1jqar_23 {
   border-bottom-color: #00000000;
 }

-._headerScore_43lfg_27 {
+._headerScore_1jqar_27 {
   padding-left: 2em;
 }

-._targetValue_43lfg_31 {
-  padding-right: 2em;
-  padding-left: 0;
-  padding-top: 0;
+._targetValue_1jqar_31 {
+  padding-right: 2em !important;
+  padding-left: 0 !important;
+  padding-top: 0 !important;
 }

-._answerValue_43lfg_37 {
-  padding-left: 0;
-  padding-top: 0;
+._answerValue_1jqar_37 {
+  padding-left: 0 !important;
+  padding-top: 0 !important;
 }

-._scoreValue_43lfg_42 {
-  padding-left: 2em;
-  padding-top: 0;
+._scoreValue_1jqar_42 {
+  padding-left: 2em !important;
+  padding-top: 0 !important;
 }

-._noLeft_43lfg_47 {
-  padding-left: 0;
+._noLeft_1jqar_47 {
+  padding-left: 0 !important;
 }

-._noTop_43lfg_51 {
-  margin-top: 0;
+._noTop_1jqar_51 {
+  margin-top: 0 !important;
 }
 ._wrapper_b0it4_1 {
   display: grid;
@@ -19490,7 +19495,7 @@ span.ap-marker-container:hover span.ap-marker {
   display: grid;
   grid-template-columns: minmax(0, max-content) max-content;
 }
-._simpleMetricsRows_13pa9_1 {
+._simpleMetricsRows_tnqkm_1 {
   display: flex;
   flex-direction: row;
   flex-wrap: wrap;
@@ -19501,28 +19506,28 @@ span.ap-marker-container:hover span.ap-marker {
   overflow: scroll;
 }

-._multiMetricsRows_13pa9_12 {
+._multiMetricsRows_tnqkm_12 {
   display: flex;
   flex-direction: row;
   flex-wrap: wrap;
   justify-content: end;
-  height: 100%;
   align-items: center;
   margin-top: 0.2rem;
   padding-bottom: 0.4rem;
   row-gap: 1em;
   max-height: 15em;
   overflow: scroll;
+  align-items: baseline;
 }

-._verticalMetricReducer_13pa9_26 {
+._verticalMetricReducer_tnqkm_26 {
   font-size: var(--inspect-font-size-smaller);
   text-align: center;
   padding-top: 0.3rem;
   margin-bottom: -0.3rem;
 }

-._verticalMetricName_13pa9_33 {
+._verticalMetricName_tnqkm_33 {
   font-size: var(--inspect-font-size-smaller);
   text-align: center;
   padding-top: 0.3rem;
@@ -19530,32 +19535,55 @@ span.ap-marker-container:hover span.ap-marker {
   border-bottom: solid var(--bs-border-color) 1px;
 }

-._verticalMetricValue_13pa9_41 {
-  font-size: var(--inspect-font-size-larger);
+._verticalMetricValue_tnqkm_41 {
   font-weight: 500;
   text-align: center;
 }

-._multiScorerReducer_13pa9_47 {
+._multiScorer_tnqkm_46 {
+  padding-left: 0;
+  height: 100%;
+  display: flex;
+  flex-direction: column;
+  padding: 0.5em 1em;
+}
+
+._multiScorerIndent_tnqkm_54 {
+  padding-left: 1.5em;
+}
+
+._multiScorerReducer_tnqkm_58 {
   text-align: center;
   margin-bottom: -0.3rem;
+  margin-top: 0.2em;
 }

-._multiScorerLabel_13pa9_52 {
+._multiScorerLabel_tnqkm_64 {
   text-align: center;
   border-bottom: solid var(--bs-border-color) 1px;
   margin-bottom: -0.1rem;
 }

-._multiScorerValue_13pa9_58 {
+._multiScorerValue_tnqkm_70 {
   display: grid;
   grid-template-columns: auto auto;
+  grid-auto-rows: auto;
   grid-column-gap: 0.3rem;
   grid-row-gap: 0;
+  padding-top: 0.3em;
 }

-._multiScorerValueContent_13pa9_65 {
+._multiScorerValueContent_tnqkm_79 {
   font-weight: 600;
+  text-align: center;
+}
+
+._multiScoreMetricGrid_tnqkm_84 {
+  display: grid;
+  grid-template-rows: auto auto;
+  column-gap: 1em;
+  padding: 0 0.2em;
+  justify-content: center;
 }
 ._statusPanel_1fzh4_1 {
   padding: 1em;