inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +2 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/core/progress.py +1 -1
- inspect_ai/_display/textual/app.py +8 -4
- inspect_ai/_display/textual/widgets/samples.py +6 -5
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/__init__.py +0 -0
- inspect_ai/_eval/eval.py +100 -97
- inspect_ai/_eval/evalset.py +69 -69
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +6 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/__init__.py +0 -0
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/App.css +8 -3
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +66 -38
- inspect_ai/_view/www/dist/assets/index.js +525 -523
- inspect_ai/_view/www/log-schema.json +86 -73
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/App.tsx +1 -0
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
- inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
- inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
- inspect_ai/_view/www/src/types/log.d.ts +107 -19
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +36 -45
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +13 -13
- inspect_ai/dataset/_sources/hf.py +29 -29
- inspect_ai/dataset/_sources/json.py +10 -10
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +98 -7
- inspect_ai/log/_message.py +3 -1
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +2 -2
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openrouter.py +1 -1
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +1 -1
- inspect_ai/scorer/_classification.py +4 -0
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +15 -18
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +2 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/_tools/_computer/_common.py +2 -2
- inspect_ai/tool/_tools/_computer/_computer.py +11 -0
- inspect_ai/tool/_tools/_execute.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +10 -1
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/task/run.py
CHANGED
@@ -190,7 +190,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
     if task.setup:
         plan.steps = unroll(task.setup) + plan.steps

-    #
+    # resolve the scorer
     score = score and task.scorer is not None
     scorers: list[Scorer] | None = task.scorer if (score and task.scorer) else None
     scorer_profiles = (
@@ -519,6 +519,7 @@ async def task_run_sample(
                 key: SampleScore(
                     score=score,
                     sample_id=previous_sample.id,
+                    sample_metadata=previous_sample.metadata,
                 )
                 for key, score in previous_sample.scores.items()
             }
@@ -696,6 +697,7 @@ async def task_run_sample(
                 sample_score = SampleScore(
                     score=score_result,
                     sample_id=sample.id,
+                    sample_metadata=sample.metadata,
                     scorer=registry_unqualified_name(scorer),
                 )
                 transcript()._event(
@@ -709,7 +711,9 @@ async def task_run_sample(
             if state.scores is not None:
                 for name, score in state.scores.items():
                     results[name] = SampleScore(
-                        score=score,
+                        score=score,
+                        sample_id=state.sample_id,
+                        sample_metadata=state.metadata,
                     )
                     transcript()._event(
                         ScoreEvent(score=score, target=sample.target)
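Note: the recurring change in this file is that SampleScore records are now constructed with the originating sample's id and metadata. A minimal sketch of the new shape, with hypothetical values (the keyword arguments come from the diff above; the import location of SampleScore is an assumption):

from inspect_ai.scorer import Score
from inspect_ai.scorer._metric import SampleScore  # assumed import location

sample_score = SampleScore(
    score=Score(value=1.0, answer="42"),     # the underlying Score
    sample_id="sample-1",                    # hypothetical sample id
    sample_metadata={"difficulty": "easy"},  # hypothetical sample metadata
    scorer="match",
)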
inspect_ai/_eval/task/sandbox.py
CHANGED
@@ -5,11 +5,20 @@ from random import random
 from typing import AsyncGenerator, Callable, NamedTuple, cast

 import httpx
+from tenacity import (
+    retry,
+    retry_if_exception,
+    stop_after_attempt,
+    stop_after_delay,
+    wait_exponential_jitter,
+)

 from inspect_ai._eval.task.task import Task
 from inspect_ai._eval.task.util import task_run_dir
+from inspect_ai._util.constants import DEFAULT_MAX_RETRIES, DEFAULT_TIMEOUT
 from inspect_ai._util.file import file, filesystem
 from inspect_ai._util.registry import registry_unqualified_name
+from inspect_ai._util.retry import httpx_should_retry, log_retry_attempt
 from inspect_ai._util.url import data_uri_to_base64, is_data_uri, is_http_url
 from inspect_ai.dataset import Sample
 from inspect_ai.util._concurrency import concurrency
@@ -115,8 +124,7 @@ async def read_sandboxenv_file(contents: str) -> bytes:
         contents_base64 = data_uri_to_base64(contents)
         file_bytes = base64.b64decode(contents_base64)
     elif is_http_url(contents):
-
-        file_bytes = (await client.get(contents, follow_redirects=True)).content
+        file_bytes = await _retrying_httpx_get(contents)
     else:
         # try to read as a file (if it doesn't exist or has a path not cool w/
         # the filesystem then we fall back to contents)
@@ -172,3 +180,28 @@ def resolve_sandbox(
         return sample.sandbox
     else:
         return None
+
+
+async def _retrying_httpx_get(
+    url: str,
+    client: httpx.AsyncClient = httpx.AsyncClient(),
+    timeout: int = 30,  # per-attempt timeout
+    max_retries: int = DEFAULT_MAX_RETRIES,
+    total_timeout: int = DEFAULT_TIMEOUT,  # timeout for the whole retry loop. not for an individual attempt
+) -> bytes:
+    @retry(
+        wait=wait_exponential_jitter(),
+        stop=(stop_after_attempt(max_retries) | stop_after_delay(total_timeout)),
+        retry=retry_if_exception(httpx_should_retry),
+        before_sleep=log_retry_attempt(url),
+    )
+    async def do_get() -> bytes:
+        response = await client.get(
+            url=url,
+            follow_redirects=True,
+            timeout=(timeout, timeout, timeout, timeout),
+        )
+        response.raise_for_status()
+        return response.content
+
+    return await do_get()
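Note: the new _retrying_httpx_get helper replaces the bare client.get call with a tenacity-managed retry loop. A self-contained sketch of the same pattern (tenacity and httpx are the real libraries used above; the predicate below is a simplified stand-in for inspect_ai's internal httpx_should_retry):

import httpx
from tenacity import (
    retry,
    retry_if_exception,
    stop_after_attempt,
    stop_after_delay,
    wait_exponential_jitter,
)

def should_retry(ex: BaseException) -> bool:
    # retry connection/timeout failures and 429/5xx responses
    if isinstance(ex, (httpx.ConnectError, httpx.TimeoutException)):
        return True
    if isinstance(ex, httpx.HTTPStatusError):
        status = ex.response.status_code
        return status == 429 or status >= 500
    return False

@retry(
    wait=wait_exponential_jitter(),                        # exponential backoff with jitter
    stop=(stop_after_attempt(5) | stop_after_delay(120)),  # cap attempts and total elapsed time
    retry=retry_if_exception(should_retry),
)
async def fetch_bytes(url: str) -> bytes:
    async with httpx.AsyncClient() as client:
        response = await client.get(url, follow_redirects=True, timeout=30)
        response.raise_for_status()  # surfaces 429/5xx to the retry predicate
        return response.content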
inspect_ai/_eval/task/task.py
CHANGED
@@ -39,38 +39,6 @@ class Task:
     r"""Evaluation task.

     Tasks are the basis for defining and running evaluations.
-
-    Args:
-        dataset (Dataset | Sequence[Sample]): Dataset to evaluate
-        setup: (Solver | list[Solver] | None): Setup step (always run
-            even when the main `solver` is replaced).
-        solver: (Solver | list[Solver]): Solver or list of solvers.
-            Defaults to generate(), a normal call to the model.
-        scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
-        metrics (list[Metric] | dict[str, list[Metric]] | None):
-            Alternative metrics (overrides the metrics provided by the specified scorer).
-        config (GenerateConfig): Model generation config.
-        sandbox (SandboxEnvironmentType | None): Sandbox environment type
-            (or optionally a str or tuple with a shorthand spec)
-        approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
-            Either a path to an approval policy config file or a list of approval policies.
-            Defaults to no approval policy.
-        epochs (int | Epochs | None): Epochs to repeat samples for and optional score
-            reducer function(s) used to combine sample scores (defaults to "mean")
-        fail_on_error (bool | float | None): `True` to fail on first sample error
-            (default); `False` to never fail on sample errors; Value between 0 and 1
-            to fail if a proportion of total samples fails. Value greater than 1 to fail
-            eval if a count of samples fails.
-        message_limit (int | None): Limit on total messages used for each sample.
-        token_limit (int | None): Limit on total tokens used for each sample.
-        time_limit (int | None): Limit on time (in seconds) for execution of each sample.
-        name: (str | None): Task name. If not specified is automatically
-            determined based on the name of the task directory (or "task")
-            if its anonymous task (e.g. created in a notebook and passed to
-            eval() directly)
-        version: (int): Version of task (to distinguish evolutions
-            of the task spec or breaking changes to it)
-        metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
     """

     def __init__(
@@ -93,6 +61,41 @@ class Task:
         metadata: dict[str, Any] | None = None,
         **kwargs: Unpack[TaskDeprecatedArgs],
     ) -> None:
+        """Create a task.
+
+        Args:
+            dataset (Dataset | Sequence[Sample]): Dataset to evaluate
+            setup: (Solver | list[Solver] | None): Setup step (always run
+                even when the main `solver` is replaced).
+            solver: (Solver | list[Solver]): Solver or list of solvers.
+                Defaults to generate(), a normal call to the model.
+            scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
+            metrics (list[Metric] | dict[str, list[Metric]] | None):
+                Alternative metrics (overrides the metrics provided by the specified scorer).
+            config (GenerateConfig): Model generation config.
+            sandbox (SandboxEnvironmentType | None): Sandbox environment type
+                (or optionally a str or tuple with a shorthand spec)
+            approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
+                Either a path to an approval policy config file or a list of approval policies.
+                Defaults to no approval policy.
+            epochs (int | Epochs | None): Epochs to repeat samples for and optional score
+                reducer function(s) used to combine sample scores (defaults to "mean")
+            fail_on_error (bool | float | None): `True` to fail on first sample error
+                (default); `False` to never fail on sample errors; Value between 0 and 1
+                to fail if a proportion of total samples fails. Value greater than 1 to fail
+                eval if a count of samples fails.
+            message_limit (int | None): Limit on total messages used for each sample.
+            token_limit (int | None): Limit on total tokens used for each sample.
+            time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+            name: (str | None): Task name. If not specified is automatically
+                determined based on the name of the task directory (or "task")
+                if its anonymous task (e.g. created in a notebook and passed to
+                eval() directly)
+            version: (int): Version of task (to distinguish evolutions
+                of the task spec or breaking changes to it)
+            metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
+            **kwargs: Deprecated arguments.
+        """
         # handle deprecated args
         for arg, value in kwargs.items():
             newarg = ""
@@ -179,33 +182,33 @@ def task_with(
         task (Task): Task to adapt (it is deep copied prior to mutating options)
         dataset (Dataset | Sequence[Sample]): Dataset to evaluate
         setup: (Solver | list[Solver] | None): Setup step (always run
-
+            even when the main `solver` is replaced).
         solver: (Solver | list[Solver]): Solver or list of solvers.
-
+            Defaults to generate(), a normal call to the model.
         scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
         metrics (list[Metric] | dict[str, list[Metric]] | None):
-
+            Alternative metrics (overrides the metrics provided by the specified scorer).
         config (GenerateConfig): Model generation config.
         sandbox (SandboxEnvironmentType | None): Sandbox environment type
-
+            (or optionally a str or tuple with a shorthand spec)
         approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
-
-
+            Either a path to an approval policy config file or a list of approval policies.
+            Defaults to no approval policy.
         epochs (int | Epochs | None): Epochs to repeat samples for and optional score
-
+            reducer function(s) used to combine sample scores (defaults to "mean")
         fail_on_error (bool | float | None): `True` to fail on first sample error
-
-
-
+            (default); `False` to never fail on sample errors; Value between 0 and 1
+            to fail if a proportion of total samples fails. Value greater than 1 to fail
+            eval if a count of samples fails.
         message_limit (int | None): Limit on total messages used for each sample.
         token_limit (int | None): Limit on total tokens used for each sample.
         time_limit (int | None): Limit on time (in seconds) for execution of each sample.
         name: (str | None): Task name. If not specified is automatically
-
-
-
+            determined based on the name of the task directory (or "task")
+            if its anonymous task (e.g. created in a notebook and passed to
+            eval() directly)
         version: (int): Version of task (to distinguish evolutions
-
+            of the task spec or breaking changes to it)
         metadata: (dict[str, Any] | None): Additional metadata to associate with the task.

     Returns:
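Note: the constructor docstring moved from the class body onto __init__, so it now documents Task() directly. A minimal sketch of constructing a Task with the core arguments documented above (Task, Sample, generate, and match are the library's public API; the dataset values are hypothetical):

from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import match
from inspect_ai.solver import generate

@task
def addition() -> Task:
    return Task(
        dataset=[Sample(input="What is 1 + 1?", target="2")],
        solver=generate(),  # the default solver: a plain model call
        scorer=match(),     # score by matching the target string
        metadata={"suite": "arithmetic"},  # hypothetical task metadata
    )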
inspect_ai/_util/__init__.py
File without changes
inspect_ai/_util/constants.py
CHANGED
inspect_ai/_util/content.py
CHANGED
@@ -4,6 +4,8 @@ from pydantic import BaseModel, Field


 class ContentText(BaseModel):
+    """Text content."""
+
     type: Literal["text"] = Field(default="text")
     """Type."""

@@ -12,6 +14,8 @@ class ContentText(BaseModel):


 class ContentImage(BaseModel):
+    """Image content."""
+
     type: Literal["image"] = Field(default="image")
     """Type."""

@@ -26,6 +30,8 @@ class ContentImage(BaseModel):


 class ContentAudio(BaseModel):
+    """Audio content."""
+
     type: Literal["audio"] = Field(default="audio")
     """Type."""

@@ -37,6 +43,8 @@ class ContentAudio(BaseModel):


 class ContentVideo(BaseModel):
+    """Video content."""
+
     type: Literal["video"] = Field(default="video")
     """Type."""

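Note: each content model gains a class docstring; they are pydantic models discriminated by a literal type field. A small sketch (the type default comes from the diff; the text payload field is an assumption based on the library's public usage):

from inspect_ai._util.content import ContentText

content = ContentText(text="hello world")  # `text` field assumed
assert content.type == "text"              # discriminator defaults to "text"
print(content.model_dump_json())           # standard pydantic v2 serialization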
inspect_ai/_util/error.py
CHANGED
inspect_ai/_util/file.py
CHANGED
@@ -18,6 +18,7 @@ from fsspec.core import split_protocol  # type: ignore
 from fsspec.implementations.local import make_path_posix  # type: ignore
 from pydantic import BaseModel
 from s3fs import S3FileSystem  # type: ignore
+from shortuuid import uuid

 # https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/spec.html#AbstractFileSystem
 # https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.generic.GenericFileSystem
@@ -169,6 +170,9 @@ class FileSystem:
     def exists(self, path: str) -> bool:
         return self.fs.exists(path) is True

+    def touch(self, path: str) -> None:
+        self.fs.touch(path)
+
     def rm(
         self, path: str, recursive: bool = False, maxdepth: int | None = None
     ) -> None:
@@ -218,6 +222,16 @@ class FileSystem:
     def is_local(self) -> bool:
         return isinstance(self.fs, fsspec.implementations.local.LocalFileSystem)

+    def is_writeable(self, path: str) -> bool:
+        try:
+            path = path.rstrip("/\\")
+            touch_file = f"{path}{self.fs.sep}{uuid()}"
+            self.touch(touch_file)
+            self.rm(touch_file)
+            return True
+        except PermissionError:
+            return False
+
     def is_async(self) -> bool:
         return isinstance(self.fs, fsspec.asyn.AsyncFileSystem)

@@ -354,7 +368,7 @@ def safe_filename(s: str, max_length: int = 255) -> str:
     Returns:
         str: A safe filename string

-
+    Examples:
         >>> safe_filename("Hello/World?.txt")
         'Hello_World.txt'
     """
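Note: the new is_writeable probes a directory by touching (then removing) a uniquely named file and returns False on PermissionError. A sketch of using it to validate a log directory up front (filesystem and is_writeable come from the diff above; the path is hypothetical):

from inspect_ai._util.file import filesystem

fs = filesystem("./logs")  # resolves local paths or fsspec URLs (e.g. s3://)
if not fs.is_writeable("./logs"):
    raise PermissionError("log directory './logs' is not writeable")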
inspect_ai/_util/logger.py
CHANGED
@@ -161,7 +161,7 @@ def init_logger(
     getLogger().addHandler(_logHandler)

     # establish default capture level
-    capture_level = min(TRACE, levelno)
+    capture_level = min(TRACE, levelno, transcript_levelno)

     # see all the messages (we won't actually display/write all of them)
     getLogger().setLevel(capture_level)
@@ -181,7 +181,9 @@ def notify_logger_record(record: LogRecord, write: bool) -> None:
     from inspect_ai.log._transcript import LoggerEvent, transcript

     if write:
-        transcript()._event(
+        transcript()._event(
+            LoggerEvent(message=LoggingMessage._from_log_record(record))
+        )
     global _rate_limit_count
     if (record.levelno <= INFO and re.search(r"\b429\b", record.getMessage())) or (
         record.levelno == DEBUG
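Note: init_logger now folds the transcript log level into the capture level, so the root logger is opened wide enough for both the console display and the transcript. A toy illustration of the min() logic (the numeric value of inspect_ai's custom TRACE level is an assumption; it sits below DEBUG):

from logging import DEBUG, INFO

TRACE = DEBUG - 5  # stand-in for inspect_ai's custom TRACE level (assumed below DEBUG)
levelno, transcript_levelno = INFO, DEBUG  # hypothetical console/transcript levels
capture_level = min(TRACE, levelno, transcript_levelno)
assert capture_level == TRACE  # capture everything either sink might need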
inspect_ai/_util/registry.py
CHANGED
@@ -209,7 +209,13 @@ def registry_create(type: RegistryType, name: str, **kwargs: Any) -> object:
     if isclass(obj):
         return with_registry_info(obj(**kwargs))
     elif callable(obj):
-        return_type =
+        return_type = get_annotations(obj).get("return")
+        # Until we remove the MetricDeprecated symbol we need this extra
+        # bit to map the Metric union back to Metric
+        if "_metric.Metric" in str(return_type):
+            return_type = "Metric"
+        else:
+            return_type = getattr(return_type, "__name__", None)
         if return_type and return_type.lower() == type:
             return with_registry_info(obj(**kwargs))
     else:
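Note: registry_create now resolves a callable's return annotation and compares its name against the registry type. A standalone sketch of that check (inspect.get_annotations is the stdlib function, Python 3.10+; the Scorer class below is a stand-in for a registry type):

from inspect import get_annotations

class Scorer:  # stand-in for a registry type
    ...

def scorer_factory() -> Scorer:
    return Scorer()

return_type = get_annotations(scorer_factory).get("return")
name = getattr(return_type, "__name__", None)
assert name is not None and name.lower() == "scorer"  # matches registry type "scorer"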
inspect_ai/_view/view.py
CHANGED
@@ -28,11 +28,10 @@ def view(
     port: int = DEFAULT_VIEW_PORT,
     authorization: str | None = None,
     log_level: str | None = None,
-    log_level_transcript: str | None = None,
     fs_options: dict[str, Any] = {},
 ) -> None:
     init_dotenv()
-    init_logger(log_level
+    init_logger(log_level)

     # initialize the log_dir
     log_dir = log_dir if log_dir else os.getenv("INSPECT_LOG_DIR", "./logs")
inspect_ai/_view/www/App.css
CHANGED
@@ -25,6 +25,7 @@
   /* Inspect Font Sizes */
   --inspect-font-size-title: 1.5rem;
   --inspect-font-size-title-secondary: 1.3rem;
+  --inspect-font-size-largest: 1.2rem;
   --inspect-font-size-larger: 1.1rem;
   --inspect-font-size-large: 1rem;
   --inspect-font-size-base: 0.9rem;
@@ -64,15 +65,15 @@ body[class^="vscode-"] .app-main-grid {

 /* Inspect Text Styles */
 .text-style-label {
-  text-transform: uppercase;
+  text-transform: uppercase !important;
 }

 .text-style-secondary {
-  color: var(--bs-secondary);
+  color: var(--bs-secondary) !important;
 }

 .text-style-tertiary {
-  color: var(--bs-tertiary-color);
+  color: var(--bs-tertiary-color) !important;
 }

 /* Inspect Font Size Styles */
@@ -84,6 +85,10 @@ body[class^="vscode-"] .app-main-grid {
   font-size: var(--inspect-font-size-title-secondary);
 }

+.text-size-largest {
+  font-size: var(--inspect-font-size-largest);
+}
+
 .text-size-larger {
   font-size: var(--inspect-font-size-larger);
 }
inspect_ai/_view/www/README.md
CHANGED
inspect_ai/_view/www/dist/assets/index.css
CHANGED
@@ -14298,6 +14298,7 @@ pre[class*="language-"] {
   /* Inspect Font Sizes */
   --inspect-font-size-title: 1.5rem;
   --inspect-font-size-title-secondary: 1.3rem;
+  --inspect-font-size-largest: 1.2rem;
   --inspect-font-size-larger: 1.1rem;
   --inspect-font-size-large: 1rem;
   --inspect-font-size-base: 0.9rem;
@@ -14337,15 +14338,15 @@ body[class^="vscode-"] .app-main-grid {

 /* Inspect Text Styles */
 .text-style-label {
-  text-transform: uppercase;
+  text-transform: uppercase !important;
 }

 .text-style-secondary {
-  color: var(--bs-secondary);
+  color: var(--bs-secondary) !important;
 }

 .text-style-tertiary {
-  color: var(--bs-tertiary-color);
+  color: var(--bs-tertiary-color) !important;
 }

 /* Inspect Font Size Styles */
@@ -14357,6 +14358,10 @@ body[class^="vscode-"] .app-main-grid {
   font-size: var(--inspect-font-size-title-secondary);
 }

+.text-size-largest {
+  font-size: var(--inspect-font-size-largest);
+}
+
 .text-size-larger {
   font-size: var(--inspect-font-size-larger);
 }
@@ -16195,58 +16200,58 @@ ul.jsondiffpatch-textdiff {
   grid-template-columns: max-content max-content;
   column-gap: 1em;
 }
-.
+._container_1jqar_1 {
   margin-top: 0.5em;
   padding-left: 0;
 }

-.
-  padding-right: 2em;
-  padding-left: 0;
-  padding-bottom: 0;
+._label_1jqar_6 {
+  padding-right: 2em !important;
+  padding-left: 0 !important;
+  padding-bottom: 0 !important;
   font-weight: 400;
-  padding-bottom: 0;
+  padding-bottom: 0 !important;
 }

-.
+._wordBreak_1jqar_14 {
   word-break: break-all;
 }

-.
+._scoreTable_1jqar_18 {
   width: 100%;
   margin-bottom: 1em;
 }

-.
+._bottomBorder_1jqar_23 {
   border-bottom-color: #00000000;
 }

-.
+._headerScore_1jqar_27 {
   padding-left: 2em;
 }

-.
-  padding-right: 2em;
-  padding-left: 0;
-  padding-top: 0;
+._targetValue_1jqar_31 {
+  padding-right: 2em !important;
+  padding-left: 0 !important;
+  padding-top: 0 !important;
 }

-.
-  padding-left: 0;
-  padding-top: 0;
+._answerValue_1jqar_37 {
+  padding-left: 0 !important;
+  padding-top: 0 !important;
 }

-.
-  padding-left: 2em;
-  padding-top: 0;
+._scoreValue_1jqar_42 {
+  padding-left: 2em !important;
+  padding-top: 0 !important;
 }

-.
-  padding-left: 0;
+._noLeft_1jqar_47 {
+  padding-left: 0 !important;
 }

-.
-  margin-top: 0;
+._noTop_1jqar_51 {
+  margin-top: 0 !important;
 }
 ._wrapper_b0it4_1 {
   display: grid;
@@ -19490,7 +19495,7 @@ span.ap-marker-container:hover span.ap-marker {
   display: grid;
   grid-template-columns: minmax(0, max-content) max-content;
 }
-.
+._simpleMetricsRows_tnqkm_1 {
   display: flex;
   flex-direction: row;
   flex-wrap: wrap;
@@ -19501,28 +19506,28 @@ span.ap-marker-container:hover span.ap-marker {
   overflow: scroll;
 }

-.
+._multiMetricsRows_tnqkm_12 {
   display: flex;
   flex-direction: row;
   flex-wrap: wrap;
   justify-content: end;
-  height: 100%;
   align-items: center;
   margin-top: 0.2rem;
   padding-bottom: 0.4rem;
   row-gap: 1em;
   max-height: 15em;
   overflow: scroll;
+  align-items: baseline;
 }

-.
+._verticalMetricReducer_tnqkm_26 {
   font-size: var(--inspect-font-size-smaller);
   text-align: center;
   padding-top: 0.3rem;
   margin-bottom: -0.3rem;
 }

-.
+._verticalMetricName_tnqkm_33 {
   font-size: var(--inspect-font-size-smaller);
   text-align: center;
   padding-top: 0.3rem;
@@ -19530,32 +19535,55 @@ span.ap-marker-container:hover span.ap-marker {
   border-bottom: solid var(--bs-border-color) 1px;
 }

-.
-  font-size: var(--inspect-font-size-larger);
+._verticalMetricValue_tnqkm_41 {
   font-weight: 500;
   text-align: center;
 }

-.
+._multiScorer_tnqkm_46 {
+  padding-left: 0;
+  height: 100%;
+  display: flex;
+  flex-direction: column;
+  padding: 0.5em 1em;
+}
+
+._multiScorerIndent_tnqkm_54 {
+  padding-left: 1.5em;
+}
+
+._multiScorerReducer_tnqkm_58 {
   text-align: center;
   margin-bottom: -0.3rem;
+  margin-top: 0.2em;
 }

-.
+._multiScorerLabel_tnqkm_64 {
   text-align: center;
   border-bottom: solid var(--bs-border-color) 1px;
   margin-bottom: -0.1rem;
 }

-.
+._multiScorerValue_tnqkm_70 {
   display: grid;
   grid-template-columns: auto auto;
+  grid-auto-rows: auto;
   grid-column-gap: 0.3rem;
   grid-row-gap: 0;
+  padding-top: 0.3em;
 }

-.
+._multiScorerValueContent_tnqkm_79 {
   font-weight: 600;
+  text-align: center;
+}
+
+._multiScoreMetricGrid_tnqkm_84 {
+  display: grid;
+  grid-template-rows: auto auto;
+  column-gap: 1em;
+  padding: 0 0.2em;
+  justify-content: center;
 }
 ._statusPanel_1fzh4_1 {
   padding: 1em;
|