inspect-ai 0.3.74__py3-none-any.whl → 0.3.76__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +3 -2
- inspect_ai/_cli/cache.py +1 -1
- inspect_ai/_cli/common.py +15 -0
- inspect_ai/_cli/eval.py +4 -5
- inspect_ai/_cli/log.py +1 -1
- inspect_ai/_cli/sandbox.py +1 -1
- inspect_ai/_cli/trace.py +1 -1
- inspect_ai/_cli/view.py +1 -1
- inspect_ai/_display/core/config.py +3 -1
- inspect_ai/_eval/eval.py +55 -61
- inspect_ai/_eval/evalset.py +64 -154
- inspect_ai/_eval/loader.py +27 -54
- inspect_ai/_eval/registry.py +4 -15
- inspect_ai/_eval/run.py +7 -4
- inspect_ai/_eval/task/__init__.py +8 -2
- inspect_ai/_eval/task/log.py +9 -1
- inspect_ai/_eval/task/resolved.py +35 -0
- inspect_ai/_eval/task/run.py +4 -0
- inspect_ai/_eval/task/task.py +50 -69
- inspect_ai/_eval/task/tasks.py +30 -0
- inspect_ai/_util/constants.py +3 -0
- inspect_ai/_util/dotenv.py +17 -0
- inspect_ai/_util/logger.py +3 -0
- inspect_ai/_util/registry.py +43 -2
- inspect_ai/_view/server.py +28 -10
- inspect_ai/_view/www/dist/assets/index.css +32 -19
- inspect_ai/_view/www/dist/assets/index.js +17682 -29989
- inspect_ai/_view/www/log-schema.json +79 -9
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/appearance/styles.ts +6 -5
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
- inspect_ai/_view/www/src/constants.ts +3 -0
- inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +1 -1
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +2 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
- inspect_ai/_view/www/src/types/log.d.ts +11 -5
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
- inspect_ai/_view/www/yarn.lock +12 -5
- inspect_ai/log/_log.py +10 -1
- inspect_ai/log/_recorders/eval.py +27 -8
- inspect_ai/log/_recorders/json.py +10 -2
- inspect_ai/log/_transcript.py +13 -4
- inspect_ai/model/_call_tools.py +13 -4
- inspect_ai/model/_chat_message.py +15 -1
- inspect_ai/model/_model.py +30 -12
- inspect_ai/model/_model_output.py +6 -1
- inspect_ai/model/_openai.py +11 -6
- inspect_ai/model/_providers/anthropic.py +167 -77
- inspect_ai/model/_providers/google.py +6 -2
- inspect_ai/model/_providers/none.py +31 -0
- inspect_ai/model/_providers/openai.py +11 -8
- inspect_ai/model/_providers/providers.py +7 -0
- inspect_ai/model/_providers/vertex.py +5 -2
- inspect_ai/solver/_bridge/bridge.py +1 -1
- inspect_ai/solver/_chain.py +7 -6
- inspect_ai/tool/__init__.py +4 -0
- inspect_ai/tool/_tool_call.py +5 -2
- inspect_ai/tool/_tool_support_helpers.py +200 -0
- inspect_ai/tool/_tools/_bash_session.py +119 -0
- inspect_ai/tool/_tools/_computer/_computer.py +1 -1
- inspect_ai/tool/_tools/_text_editor.py +121 -0
- inspect_ai/tool/_tools/_web_browser/_back_compat.py +150 -0
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +75 -130
- inspect_ai/tool/_tools/_web_search.py +2 -2
- inspect_ai/util/_json.py +28 -0
- inspect_ai/util/_sandbox/context.py +18 -8
- inspect_ai/util/_sandbox/docker/config.py +1 -1
- inspect_ai/util/_sandbox/docker/internal.py +3 -3
- inspect_ai/util/_sandbox/environment.py +17 -2
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/METADATA +8 -5
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/RECORD +85 -108
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/WHEEL +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +0 -8
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +0 -24
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +0 -25
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +0 -22
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +0 -63
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +0 -323
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +0 -5
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +0 -279
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +0 -9
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +0 -293
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +0 -94
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +0 -2
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +0 -2
- inspect_ai/tool/_tools/_web_browser/_resources/mock_environment.py +0 -45
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +0 -50
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +0 -48
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +0 -280
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +0 -64
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +0 -146
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +0 -64
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +0 -180
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +0 -99
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +0 -15
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +0 -44
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +0 -39
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +0 -214
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +0 -35
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +0 -192
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info/licenses}/LICENSE +0 -0
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,35 @@
|
|
1
|
+
from dataclasses import dataclass, field
|
2
|
+
from typing import Any, Set
|
3
|
+
|
4
|
+
from inspect_ai._eval.task import Task
|
5
|
+
from inspect_ai._eval.task.run import EvalSampleSource
|
6
|
+
from inspect_ai.model import Model
|
7
|
+
from inspect_ai.util import SandboxEnvironmentSpec
|
8
|
+
|
9
|
+
|
10
|
+
@dataclass(frozen=True)
|
11
|
+
class ResolvedTask:
|
12
|
+
task: Task
|
13
|
+
task_args: dict[str, Any]
|
14
|
+
task_file: str | None
|
15
|
+
model: Model
|
16
|
+
sandbox: SandboxEnvironmentSpec | None
|
17
|
+
sequence: int
|
18
|
+
id: str | None = field(default=None)
|
19
|
+
sample_source: EvalSampleSource | None = field(default=None)
|
20
|
+
|
21
|
+
@property
|
22
|
+
def has_sandbox(self) -> bool:
|
23
|
+
if self.sandbox:
|
24
|
+
return True
|
25
|
+
else:
|
26
|
+
return any(
|
27
|
+
[True if sample.sandbox else False for sample in self.task.dataset]
|
28
|
+
)
|
29
|
+
|
30
|
+
|
31
|
+
def resolved_model_names(tasks: list[ResolvedTask]) -> list[str]:
    """Return the distinct model names used by `tasks`.

    Args:
        tasks: Resolved tasks whose `model` is converted with `str()`.

    Returns:
        Unique model names in first-seen order (deterministic across runs,
        unlike iteration over a set of strings).
    """
    # dict.fromkeys de-duplicates while preserving insertion order
    return list(dict.fromkeys(str(task.model) for task in tasks))
|
inspect_ai/_eval/task/run.py
CHANGED
@@ -599,6 +599,10 @@ async def task_run_sample(
|
|
599
599
|
)
|
600
600
|
|
601
601
|
async with sandboxenv_cm:
|
602
|
+
timeout_cm: (
|
603
|
+
contextlib._GeneratorContextManager[anyio.CancelScope, None, None]
|
604
|
+
| contextlib.nullcontext[None]
|
605
|
+
) = contextlib.nullcontext()
|
602
606
|
try:
|
603
607
|
# update active sample with sandboxes now that we are initialised
|
604
608
|
active.sandboxes = await sandbox_connections()
|
inspect_ai/_eval/task/task.py
CHANGED
@@ -13,6 +13,7 @@ from inspect_ai.approval._policy import ApprovalPolicy, approval_policies_from_c
|
|
13
13
|
from inspect_ai.dataset import Dataset, MemoryDataset, Sample
|
14
14
|
from inspect_ai.log import EvalLog
|
15
15
|
from inspect_ai.model import GenerateConfig
|
16
|
+
from inspect_ai.model._model import Model, get_model
|
16
17
|
from inspect_ai.scorer import Metric, Scorer
|
17
18
|
from inspect_ai.scorer._reducer import ScoreReducers, create_reducers
|
18
19
|
from inspect_ai.solver import Plan, Solver, generate
|
@@ -50,6 +51,7 @@ class Task:
|
|
50
51
|
cleanup: Callable[[TaskState], Awaitable[None]] | None = None,
|
51
52
|
scorer: Scorer | list[Scorer] | None = None,
|
52
53
|
metrics: list[Metric] | dict[str, list[Metric]] | None = None,
|
54
|
+
model: str | Model | None = None,
|
53
55
|
config: GenerateConfig = GenerateConfig(),
|
54
56
|
sandbox: SandboxEnvironmentType | None = None,
|
55
57
|
approval: str | list[ApprovalPolicy] | None = None,
|
@@ -67,42 +69,38 @@ class Task:
|
|
67
69
|
"""Create a task.
|
68
70
|
|
69
71
|
Args:
|
70
|
-
dataset
|
71
|
-
setup: (
|
72
|
-
|
73
|
-
solver: (Solver | list[Solver]): Solver or list of solvers.
|
74
|
-
Defaults to generate(), a normal call to the model.
|
72
|
+
dataset: Dataset to evaluate
|
73
|
+
setup: Setup step (always run even when the main `solver` is replaced).
|
74
|
+
solver: Solver or list of solvers. Defaults to generate(), a normal call to the model.
|
75
75
|
cleanup: Optional cleanup function for task. Called after
|
76
76
|
all solvers have run for each sample (including if an
|
77
77
|
exception occurs during the run)
|
78
|
-
scorer:
|
79
|
-
metrics (
|
80
|
-
|
81
|
-
config
|
82
|
-
sandbox (
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
Defaults to no approval policy.
|
87
|
-
epochs (int | Epochs | None): Epochs to repeat samples for and optional score
|
78
|
+
scorer: Scorer used to evaluate model output.
|
79
|
+
metrics: Alternative metrics (overrides the metrics provided by the specified scorer).
|
80
|
+
model: Default model for task (Optional, defaults to eval model).
|
81
|
+
config: Model generation config.
|
82
|
+
sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
|
83
|
+
approval: Tool use approval policies.
|
84
|
+
Either a path to an approval policy config file or a list of approval policies. Defaults to no approval policy.
|
85
|
+
epochs: Epochs to repeat samples for and optional score
|
88
86
|
reducer function(s) used to combine sample scores (defaults to "mean")
|
89
|
-
fail_on_error
|
87
|
+
fail_on_error: `True` to fail on first sample error
|
90
88
|
(default); `False` to never fail on sample errors; Value between 0 and 1
|
91
89
|
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
92
90
|
eval if a count of samples fails.
|
93
|
-
message_limit
|
94
|
-
token_limit
|
91
|
+
message_limit: Limit on total messages used for each sample.
|
92
|
+
token_limit: Limit on total tokens used for each sample.
|
95
93
|
time_limit: Limit on clock time (in seconds) for samples.
|
96
94
|
working_limit: Limit on working time (in seconds) for sample. Working
|
97
95
|
time includes model generation, tool calls, etc. but does not include
|
98
96
|
time spent waiting on retries or shared resources.
|
99
|
-
name:
|
97
|
+
name: Task name. If not specified is automatically
|
100
98
|
determined based on the name of the task directory (or "task")
|
101
99
|
if it is an anonymous task (e.g. created in a notebook and passed to
|
102
100
|
eval() directly)
|
103
|
-
version:
|
101
|
+
version: Version of task (to distinguish evolutions
|
104
102
|
of the task spec or breaking changes to it)
|
105
|
-
metadata:
|
103
|
+
metadata: Additional metadata to associate with the task.
|
106
104
|
**kwargs: Deprecated arguments.
|
107
105
|
"""
|
108
106
|
# handle deprecated args
|
@@ -135,6 +133,7 @@ class Task:
|
|
135
133
|
self.cleanup = cleanup
|
136
134
|
self.scorer = resolve_scorer(scorer)
|
137
135
|
self.metrics = metrics
|
136
|
+
self.model = resolve_model(model)
|
138
137
|
self.config = config
|
139
138
|
self.sandbox = resolve_sandbox_environment(sandbox)
|
140
139
|
self.approval = resolve_approval(approval)
|
@@ -176,6 +175,7 @@ def task_with(
|
|
176
175
|
cleanup: Callable[[TaskState], Awaitable[None]] | None | NotGiven = NOT_GIVEN,
|
177
176
|
scorer: Scorer | list[Scorer] | None | NotGiven = NOT_GIVEN,
|
178
177
|
metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
|
178
|
+
model: str | Model | NotGiven = NOT_GIVEN,
|
179
179
|
config: GenerateConfig | NotGiven = NOT_GIVEN,
|
180
180
|
sandbox: SandboxEnvironmentType | None | NotGiven = NOT_GIVEN,
|
181
181
|
approval: str | list[ApprovalPolicy] | None | NotGiven = NOT_GIVEN,
|
@@ -192,43 +192,39 @@ def task_with(
|
|
192
192
|
"""Task adapted with alternate values for one or more options.
|
193
193
|
|
194
194
|
Args:
|
195
|
-
task
|
196
|
-
dataset
|
197
|
-
setup: (
|
198
|
-
|
199
|
-
solver: (Solver | list[Solver]): Solver or list of solvers.
|
200
|
-
Defaults to generate(), a normal call to the model.
|
195
|
+
task: Task to adapt (it is deep copied prior to mutating options)
|
196
|
+
dataset: Dataset to evaluate
|
197
|
+
setup: Setup step (always run even when the main `solver` is replaced).
|
198
|
+
solver: Solver or list of solvers. Defaults to generate(), a normal call to the model.
|
201
199
|
cleanup: Optional cleanup function for task. Called after
|
202
200
|
all solvers have run for each sample (including if an
|
203
201
|
exception occurs during the run)
|
204
|
-
scorer:
|
205
|
-
metrics (
|
206
|
-
|
207
|
-
config
|
208
|
-
sandbox (
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
Defaults to no approval policy.
|
213
|
-
epochs (int | Epochs | None): Epochs to repeat samples for and optional score
|
202
|
+
scorer: Scorer used to evaluate model output.
|
203
|
+
metrics: Alternative metrics (overrides the metrics provided by the specified scorer).
|
204
|
+
model: Default model for task (Optional, defaults to eval model).
|
205
|
+
config: Model generation config.
|
206
|
+
sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
|
207
|
+
approval: Tool use approval policies.
|
208
|
+
Either a path to an approval policy config file or a list of approval policies. Defaults to no approval policy.
|
209
|
+
epochs: Epochs to repeat samples for and optional score
|
214
210
|
reducer function(s) used to combine sample scores (defaults to "mean")
|
215
|
-
fail_on_error
|
211
|
+
fail_on_error: `True` to fail on first sample error
|
216
212
|
(default); `False` to never fail on sample errors; Value between 0 and 1
|
217
213
|
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
218
214
|
eval if a count of samples fails.
|
219
|
-
message_limit
|
220
|
-
token_limit
|
215
|
+
message_limit: Limit on total messages used for each sample.
|
216
|
+
token_limit: Limit on total tokens used for each sample.
|
221
217
|
time_limit: Limit on clock time (in seconds) for samples.
|
222
|
-
working_limit: Limit on
|
218
|
+
working_limit: Limit on working time (in seconds) for sample. Working
|
223
219
|
time includes model generation, tool calls, etc. but does not include
|
224
220
|
time spent waiting on retries or shared resources.
|
225
|
-
name:
|
221
|
+
name: Task name. If not specified is automatically
|
226
222
|
determined based on the name of the task directory (or "task")
|
227
223
|
if it is an anonymous task (e.g. created in a notebook and passed to
|
228
224
|
eval() directly)
|
229
|
-
version:
|
225
|
+
version: Version of task (to distinguish evolutions
|
230
226
|
of the task spec or breaking changes to it)
|
231
|
-
metadata:
|
227
|
+
metadata: Additional metadata to associate with the task.
|
232
228
|
|
233
229
|
Returns:
|
234
230
|
Task: Task adapted with alternate options.
|
@@ -248,6 +244,8 @@ def task_with(
|
|
248
244
|
task.scorer = resolve_scorer(scorer)
|
249
245
|
if not isinstance(metrics, NotGiven):
|
250
246
|
task.metrics = metrics
|
247
|
+
if not isinstance(model, NotGiven):
|
248
|
+
task.model = resolve_model(model)
|
251
249
|
if not isinstance(config, NotGiven):
|
252
250
|
task.config = config
|
253
251
|
if not isinstance(sandbox, NotGiven):
|
@@ -307,34 +305,10 @@ class PreviousTask:
|
|
307
305
|
id: str
|
308
306
|
task: str | Task
|
309
307
|
task_args: dict[str, Any]
|
308
|
+
model: Model | None
|
310
309
|
log: EvalLog
|
311
310
|
|
312
311
|
|
313
|
-
Tasks = (
|
314
|
-
str
|
315
|
-
| PreviousTask
|
316
|
-
| TaskInfo
|
317
|
-
| Task
|
318
|
-
| Callable[..., Task]
|
319
|
-
| type[Task]
|
320
|
-
| list[str]
|
321
|
-
| list[PreviousTask]
|
322
|
-
| list[TaskInfo]
|
323
|
-
| list[Task]
|
324
|
-
| list[Callable[..., Task]]
|
325
|
-
| list[type[Task]]
|
326
|
-
| None
|
327
|
-
)
|
328
|
-
r"""One or more tasks.
|
329
|
-
|
330
|
-
Tasks to be evaluated. Many forms of task specification are
|
331
|
-
supported including directory names, task functions, task
|
332
|
-
classes, and task instances (a single task or list of tasks
|
333
|
-
can be specified). None is a request to read a task out
|
334
|
-
of the current working directory.
|
335
|
-
"""
|
336
|
-
|
337
|
-
|
338
312
|
def resolve_approval(
|
339
313
|
approval: str | list[ApprovalPolicy] | None,
|
340
314
|
) -> list[ApprovalPolicy] | None:
|
@@ -370,6 +344,13 @@ def resolve_solver(solver: Solver | list[Solver]) -> Solver:
|
|
370
344
|
return chain(solver) if isinstance(solver, list) else solver
|
371
345
|
|
372
346
|
|
347
|
+
def resolve_model(model: str | Model | None) -> Model | None:
    """Resolve a model specification.

    A string is passed to `get_model()`; a `Model` instance or `None`
    is returned unchanged.
    """
    return get_model(model) if isinstance(model, str) else model
|
352
|
+
|
353
|
+
|
373
354
|
def resolve_scorer(scorer: Scorer | list[Scorer] | None) -> list[Scorer] | None:
|
374
355
|
return (
|
375
356
|
scorer if isinstance(scorer, list) else [scorer] if scorer is not None else None
|
@@ -0,0 +1,30 @@
|
|
1
|
+
from typing import Callable, TypeAlias
|
2
|
+
|
3
|
+
from .resolved import ResolvedTask
|
4
|
+
from .task import PreviousTask, Task, TaskInfo
|
5
|
+
|
6
|
+
Tasks: TypeAlias = (
|
7
|
+
str
|
8
|
+
| PreviousTask
|
9
|
+
| ResolvedTask
|
10
|
+
| TaskInfo
|
11
|
+
| Task
|
12
|
+
| Callable[..., Task]
|
13
|
+
| type[Task]
|
14
|
+
| list[str]
|
15
|
+
| list[PreviousTask]
|
16
|
+
| list[ResolvedTask]
|
17
|
+
| list[TaskInfo]
|
18
|
+
| list[Task]
|
19
|
+
| list[Callable[..., Task]]
|
20
|
+
| list[type[Task]]
|
21
|
+
| None
|
22
|
+
)
|
23
|
+
r"""One or more tasks.
|
24
|
+
|
25
|
+
Tasks to be evaluated. Many forms of task specification are
|
26
|
+
supported including directory names, task functions, task
|
27
|
+
classes, and task instances (a single task or list of tasks
|
28
|
+
can be specified). None is a request to read a task out
|
29
|
+
of the current working directory.
|
30
|
+
"""
|
inspect_ai/_util/constants.py
CHANGED
inspect_ai/_util/dotenv.py
CHANGED
@@ -52,6 +52,9 @@ def init_dotenv() -> None:
|
|
52
52
|
if inspect_log_dir:
|
53
53
|
os.environ[INSPECT_LOG_DIR_VAR] = inspect_log_dir
|
54
54
|
|
55
|
+
# re-apply any env vars specified at the cli w/ --env
|
56
|
+
apply_cli_env()
|
57
|
+
|
55
58
|
|
56
59
|
@contextlib.contextmanager
|
57
60
|
def dotenv_environ(
|
@@ -76,3 +79,17 @@ def dotenv_environ(
|
|
76
79
|
finally:
|
77
80
|
os.environ.update(update_after)
|
78
81
|
[os.environ.pop(k) for k in remove_after]
|
82
|
+
|
83
|
+
|
84
|
+
# env vars specified at the cli w/ --env (re-applied by apply_cli_env)
_cli_env: dict[str, Any] = {}


def init_cli_env(env: dict[str, Any]) -> None:
    """Record env vars given on the command line and apply them.

    Args:
        env: Mapping of variable name to value. A snapshot is taken so
            that later mutation of the caller's dict cannot change what
            `apply_cli_env()` re-applies.
    """
    global _cli_env
    _cli_env = dict(env)
    apply_cli_env()


def apply_cli_env() -> None:
    """Write every recorded cli env var into `os.environ` (values via `str()`)."""
    for var, value in _cli_env.items():
        os.environ[var] = str(value)
|
inspect_ai/_util/logger.py
CHANGED
@@ -150,6 +150,9 @@ def init_logger(log_level: str | None, log_level_transcript: str | None = None)
|
|
150
150
|
transcript_levelno=transcript_levelno,
|
151
151
|
)
|
152
152
|
|
153
|
+
# set the global log level
|
154
|
+
getLogger().setLevel(log_level)
|
155
|
+
|
153
156
|
# set the log level for our package
|
154
157
|
getLogger(PKG_NAME).setLevel(capture_level)
|
155
158
|
getLogger(PKG_NAME).addHandler(_logHandler)
|
inspect_ai/_util/registry.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Any, Callable, Literal, TypedDict, TypeGuard, cast
|
|
5
5
|
from pydantic import BaseModel, Field
|
6
6
|
from pydantic_core import to_jsonable_python
|
7
7
|
|
8
|
+
from inspect_ai._util.json import jsonable_python
|
8
9
|
from inspect_ai._util.package import get_installed_package_name
|
9
10
|
|
10
11
|
from .constants import PKG_NAME
|
@@ -198,13 +199,15 @@ def registry_create(type: RegistryType, name: str, **kwargs: Any) -> object:
|
|
198
199
|
def with_registry_info(o: object) -> object:
|
199
200
|
return set_registry_info(o, registry_info(obj))
|
200
201
|
|
201
|
-
# instantiate registry objects
|
202
|
+
# instantiate registry and model objects
|
202
203
|
for param in kwargs.keys():
|
203
204
|
value = kwargs[param]
|
204
205
|
if is_registry_dict(value):
|
205
206
|
kwargs[param] = registry_create(
|
206
207
|
value["type"], value["name"], **value["params"]
|
207
208
|
)
|
209
|
+
elif is_model_dict(value):
|
210
|
+
kwargs[param] = model_create_from_dict(value)
|
208
211
|
|
209
212
|
if isclass(obj):
|
210
213
|
return with_registry_info(obj(**kwargs))
|
@@ -380,6 +383,8 @@ def is_registry_dict(o: object) -> TypeGuard[RegistryDict]:
|
|
380
383
|
|
381
384
|
|
382
385
|
def registry_value(o: object) -> Any:
|
386
|
+
from inspect_ai.model._model import Model
|
387
|
+
|
383
388
|
# treat tuple as list
|
384
389
|
if isinstance(o, tuple):
|
385
390
|
o = list(o)
|
@@ -390,14 +395,50 @@ def registry_value(o: object) -> Any:
|
|
390
395
|
elif isinstance(o, dict):
|
391
396
|
return {k: registry_value(v) for k, v in o.items()}
|
392
397
|
elif has_registry_params(o):
|
393
|
-
return
|
398
|
+
return RegistryDict(
|
394
399
|
type=registry_info(o).type,
|
395
400
|
name=registry_log_name(o),
|
396
401
|
params=registry_params(o),
|
397
402
|
)
|
403
|
+
elif isinstance(o, Model):
|
404
|
+
return ModelDict(
|
405
|
+
model=str(o),
|
406
|
+
config=jsonable_python(o.config),
|
407
|
+
base_url=o.api.base_url,
|
408
|
+
model_args=o.model_args,
|
409
|
+
)
|
398
410
|
else:
|
399
411
|
return o
|
400
412
|
|
401
413
|
|
402
414
|
def registry_create_from_dict(d: RegistryDict) -> object:
|
403
415
|
return registry_create(d["type"], d["name"], **d["params"])
|
416
|
+
|
417
|
+
|
418
|
+
class ModelDict(TypedDict):
|
419
|
+
model: str
|
420
|
+
config: dict[str, Any]
|
421
|
+
base_url: str | None
|
422
|
+
model_args: dict[str, Any]
|
423
|
+
|
424
|
+
|
425
|
+
def is_model_dict(o: object) -> TypeGuard[ModelDict]:
|
426
|
+
return (
|
427
|
+
isinstance(o, dict)
|
428
|
+
and "model" in o
|
429
|
+
and "config" in o
|
430
|
+
and "base_url" in o
|
431
|
+
and "model_args" in o
|
432
|
+
)
|
433
|
+
|
434
|
+
|
435
|
+
def model_create_from_dict(d: ModelDict) -> object:
    """Recreate a model from its `ModelDict` form via `get_model()`.

    The `config` entry is expanded into a `GenerateConfig`, and the
    `model_args` entry is forwarded as keyword arguments.
    """
    # imported at call time, as in the original
    # NOTE(review): presumably to avoid a circular import; confirm
    from inspect_ai.model._generate_config import GenerateConfig
    from inspect_ai.model._model import get_model

    model_name = d["model"]
    generate_config = GenerateConfig(**d["config"])
    base_url = d["base_url"]
    extra_args = d["model_args"]
    return get_model(
        model_name,
        config=generate_config,
        base_url=base_url,
        **extra_args,
    )
|
inspect_ai/_view/server.py
CHANGED
@@ -57,8 +57,7 @@ def view_server(
|
|
57
57
|
@routes.get("/api/logs/{log}")
|
58
58
|
async def api_log(request: web.Request) -> web.Response:
|
59
59
|
# log file requested
|
60
|
-
file = request.match_info["log"]
|
61
|
-
file = urllib.parse.unquote(file)
|
60
|
+
file = normalize_uri(request.match_info["log"])
|
62
61
|
validate_log_file_request(file)
|
63
62
|
|
64
63
|
# header_only is based on a size threshold
|
@@ -68,8 +67,7 @@ def view_server(
|
|
68
67
|
@routes.get("/api/log-size/{log}")
async def api_log_size(request: web.Request) -> web.Response:
    """Handle a log-size request: normalize and validate the path, then respond."""
    # log file requested
    log_uri = normalize_uri(request.match_info["log"])
    validate_log_file_request(log_uri)

    response = await log_size_response(log_uri)
    return response
|
@@ -77,8 +75,7 @@ def view_server(
|
|
77
75
|
@routes.get("/api/log-delete/{log}")
async def api_log_delete(request: web.Request) -> web.Response:
    """Handle a log-delete request: normalize and validate the path, then respond."""
    # log file requested
    log_uri = normalize_uri(request.match_info["log"])
    validate_log_file_request(log_uri)

    response = await log_delete_response(log_uri)
    return response
|
@@ -86,8 +83,7 @@ def view_server(
|
|
86
83
|
@routes.get("/api/log-bytes/{log}")
|
87
84
|
async def api_log_bytes(request: web.Request) -> web.Response:
|
88
85
|
# log file requested
|
89
|
-
file = request.match_info["log"]
|
90
|
-
file = urllib.parse.unquote(file)
|
86
|
+
file = normalize_uri(request.match_info["log"])
|
91
87
|
validate_log_file_request(file)
|
92
88
|
|
93
89
|
# header_only is based on a size threshold
|
@@ -106,7 +102,7 @@ def view_server(
|
|
106
102
|
if authorization:
|
107
103
|
request_log_dir = request.query.getone("log_dir", None)
|
108
104
|
if request_log_dir:
|
109
|
-
request_log_dir =
|
105
|
+
request_log_dir = normalize_uri(request_log_dir)
|
110
106
|
else:
|
111
107
|
request_log_dir = log_dir
|
112
108
|
else:
|
@@ -121,7 +117,7 @@ def view_server(
|
|
121
117
|
@routes.get("/api/log-headers")
async def api_log_headers(request: web.Request) -> web.Response:
    """Return headers for every log named in the `file` query parameters.

    Each requested file is normalized with `normalize_uri()` and checked
    with `validate_log_file_request()` before the headers are read.
    """
    files = [normalize_uri(file) for file in request.query.getall("file", [])]
    # Validate eagerly: map() returns a lazy iterator in Python 3, so a bare
    # `map(validate_log_file_request, files)` never ran the validator.
    for file in files:
        validate_log_file_request(file)
    return await log_headers_response(files)
|
127
123
|
|
@@ -166,6 +162,28 @@ def view_server(
|
|
166
162
|
)
|
167
163
|
|
168
164
|
|
165
|
+
def normalize_uri(uri: str) -> str:
    """Normalize incoming URIs to a consistent format.

    The URI is percent-decoded. Non-`file` URIs are returned decoded and
    otherwise untouched. For `file` URIs a Windows drive path written with
    three slashes (`file:///C:/dir`) is reduced to two (`file://C:/dir`).

    Args:
        uri: Possibly percent-encoded URI or path.

    Returns:
        The normalized URI. Normalizing an already-normalized value
        returns it unchanged.
    """
    # Decode any URL-encoded characters (once; reused for both branches)
    decoded = urllib.parse.unquote(uri)
    parsed = urllib.parse.urlparse(decoded)

    if parsed.scheme != "file":
        # If this isn't a file uri, just unquote it
        return decoded

    # An already-normalized `file://C:/dir` parses with the drive letter as
    # the authority, so keep it rather than dropping it. `localhost` is the
    # same as an empty authority for file URIs.
    authority = "" if parsed.netloc.lower() == "localhost" else parsed.netloc
    path = parsed.path

    # Detect and normalize Windows-style file URIs
    if not authority and path.startswith("/") and len(path) > 3 and path[2] == ":":
        # Strip leading `/` before drive letter
        path = path[1:]

    # Decoding may expose a literal `?` or `#` that belongs to the file name;
    # urlparse splits those off, so put them back instead of truncating.
    if parsed.query:
        path = f"{path}?{parsed.query}"
    if parsed.fragment:
        path = f"{path}#{parsed.fragment}"

    return f"file://{authority}{path}"
|
185
|
+
|
186
|
+
|
169
187
|
def log_listing_response(logs: list[EvalLogInfo], log_dir: str) -> web.Response:
|
170
188
|
response = dict(
|
171
189
|
log_dir=aliased_path(log_dir),
|
@@ -16346,7 +16346,7 @@ ul.jsondiffpatch-textdiff {
|
|
16346
16346
|
column-gap: 0.5em;
|
16347
16347
|
min-width: 200px;
|
16348
16348
|
}
|
16349
|
-
.
|
16349
|
+
._flatBody_1uw6w_1 {
|
16350
16350
|
color: var(--bs-danger);
|
16351
16351
|
display: grid;
|
16352
16352
|
grid-template-columns: max-content max-content;
|
@@ -16354,16 +16354,17 @@ ul.jsondiffpatch-textdiff {
|
|
16354
16354
|
margin-top: 0.4rem;
|
16355
16355
|
}
|
16356
16356
|
|
16357
|
-
.
|
16357
|
+
._iconSmall_1uw6w_9 {
|
16358
16358
|
font-size: var(--inspect-font-size-small);
|
16359
16359
|
line-height: var(--inspect-font-size-small);
|
16360
16360
|
height: var(--inspect-font-size-small);
|
16361
16361
|
}
|
16362
16362
|
|
16363
|
-
.
|
16363
|
+
._lineBase_1uw6w_15 {
|
16364
16364
|
font-size: var(--inspect-font-size-base);
|
16365
16365
|
line-height: var(--inspect-font-size-base);
|
16366
16366
|
height: var(--inspect-font-size-base);
|
16367
|
+
max-width: 30em;
|
16367
16368
|
}
|
16368
16369
|
._target_9qy4e_1 {
|
16369
16370
|
padding-left: 0;
|
@@ -16460,44 +16461,44 @@ ul.jsondiffpatch-textdiff {
|
|
16460
16461
|
font-weight: 600;
|
16461
16462
|
padding-bottom: 0.3em;
|
16462
16463
|
}
|
16463
|
-
.
|
16464
|
+
._output_15urk_1 {
|
16464
16465
|
padding-top: 1em;
|
16465
16466
|
}
|
16466
16467
|
|
16467
|
-
.
|
16468
|
+
._container_15urk_5 {
|
16468
16469
|
margin: 0.5em 0;
|
16469
16470
|
width: 100%;
|
16470
16471
|
}
|
16471
16472
|
|
16472
|
-
.
|
16473
|
+
._all_15urk_10 {
|
16473
16474
|
display: grid;
|
16474
16475
|
grid-template-columns: 1fr 1fr 1fr;
|
16475
16476
|
column-gap: 1em;
|
16476
16477
|
}
|
16477
16478
|
|
16478
|
-
.
|
16479
|
+
._tableSelection_15urk_16 {
|
16479
16480
|
width: fit-content;
|
16480
16481
|
align-self: start;
|
16481
16482
|
justify-self: start;
|
16482
16483
|
}
|
16483
16484
|
|
16484
|
-
.
|
16485
|
+
._tools_15urk_22 {
|
16485
16486
|
grid-column: -1/1;
|
16486
16487
|
}
|
16487
16488
|
|
16488
|
-
.
|
16489
|
+
._codePre_15urk_26 {
|
16489
16490
|
background: var(--bs-light);
|
16490
16491
|
width: 100%;
|
16491
16492
|
padding: 0.5em;
|
16492
16493
|
border-radius: var(--bs-border-radius);
|
16493
16494
|
}
|
16494
16495
|
|
16495
|
-
.
|
16496
|
-
white-space: pre-wrap;
|
16497
|
-
word-wrap: anywhere;
|
16496
|
+
._code_15urk_26 {
|
16497
|
+
white-space: pre-wrap !important;
|
16498
|
+
word-wrap: anywhere !important;
|
16498
16499
|
}
|
16499
16500
|
|
16500
|
-
.
|
16501
|
+
._toolConfig_15urk_38 {
|
16501
16502
|
display: grid;
|
16502
16503
|
grid-template-columns: max-content auto;
|
16503
16504
|
column-gap: 1em;
|
@@ -17031,12 +17032,14 @@ div.ap-player div.ap-control-bar * {
|
|
17031
17032
|
div.ap-control-bar svg.ap-icon path {
|
17032
17033
|
fill: var(--term-color-foreground);
|
17033
17034
|
}
|
17034
|
-
div.ap-control-bar span.ap-
|
17035
|
+
div.ap-control-bar span.ap-button {
|
17035
17036
|
display: flex;
|
17036
17037
|
flex: 0 0 auto;
|
17037
17038
|
cursor: pointer;
|
17038
|
-
|
17039
|
+
}
|
17040
|
+
div.ap-control-bar span.ap-playback-button {
|
17039
17041
|
width: 12px;
|
17042
|
+
height: 12px;
|
17040
17043
|
padding: 10px;
|
17041
17044
|
}
|
17042
17045
|
div.ap-control-bar span.ap-playback-button svg {
|
@@ -17103,13 +17106,9 @@ div.ap-control-bar.ap-seekable .ap-progressbar .ap-bar {
|
|
17103
17106
|
cursor: pointer;
|
17104
17107
|
}
|
17105
17108
|
div.ap-control-bar .ap-fullscreen-button {
|
17106
|
-
display: block;
|
17107
|
-
flex: 0 0 auto;
|
17108
17109
|
width: 14px;
|
17109
17110
|
height: 14px;
|
17110
17111
|
padding: 9px;
|
17111
|
-
cursor: pointer;
|
17112
|
-
position: relative;
|
17113
17112
|
}
|
17114
17113
|
div.ap-control-bar .ap-fullscreen-button svg {
|
17115
17114
|
width: 14px;
|
@@ -17126,6 +17125,20 @@ div.ap-control-bar .ap-fullscreen-button .ap-tooltip {
|
|
17126
17125
|
left: initial;
|
17127
17126
|
transform: none;
|
17128
17127
|
}
|
17128
|
+
div.ap-control-bar .ap-kbd-button {
|
17129
|
+
height: 14px;
|
17130
|
+
padding: 9px;
|
17131
|
+
margin: 0 4px;
|
17132
|
+
}
|
17133
|
+
div.ap-control-bar .ap-kbd-button svg {
|
17134
|
+
width: 26px;
|
17135
|
+
height: 14px;
|
17136
|
+
}
|
17137
|
+
div.ap-control-bar .ap-kbd-button .ap-tooltip {
|
17138
|
+
right: 5px;
|
17139
|
+
left: initial;
|
17140
|
+
transform: none;
|
17141
|
+
}
|
17129
17142
|
div.ap-wrapper.ap-hud .ap-control-bar {
|
17130
17143
|
opacity: 1;
|
17131
17144
|
}
|