inspect-ai 0.3.58__py3-none-any.whl → 0.3.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/common.py +3 -1
- inspect_ai/_cli/eval.py +15 -9
- inspect_ai/_display/core/active.py +4 -1
- inspect_ai/_display/core/config.py +3 -3
- inspect_ai/_display/core/panel.py +7 -3
- inspect_ai/_display/plain/__init__.py +0 -0
- inspect_ai/_display/plain/display.py +203 -0
- inspect_ai/_display/rich/display.py +0 -5
- inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
- inspect_ai/_display/textual/widgets/samples.py +79 -12
- inspect_ai/_display/textual/widgets/sandbox.py +37 -0
- inspect_ai/_eval/eval.py +10 -1
- inspect_ai/_eval/loader.py +79 -19
- inspect_ai/_eval/registry.py +6 -0
- inspect_ai/_eval/score.py +3 -1
- inspect_ai/_eval/task/results.py +51 -22
- inspect_ai/_eval/task/run.py +47 -13
- inspect_ai/_eval/task/sandbox.py +10 -5
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/port_names.py +61 -0
- inspect_ai/_util/text.py +23 -0
- inspect_ai/_view/www/App.css +31 -1
- inspect_ai/_view/www/dist/assets/index.css +31 -1
- inspect_ai/_view/www/dist/assets/index.js +25498 -2044
- inspect_ai/_view/www/log-schema.json +32 -2
- inspect_ai/_view/www/package.json +2 -0
- inspect_ai/_view/www/src/App.mjs +14 -16
- inspect_ai/_view/www/src/Types.mjs +1 -2
- inspect_ai/_view/www/src/api/Types.ts +133 -0
- inspect_ai/_view/www/src/api/{api-browser.mjs → api-browser.ts} +25 -13
- inspect_ai/_view/www/src/api/api-http.ts +219 -0
- inspect_ai/_view/www/src/api/api-shared.ts +47 -0
- inspect_ai/_view/www/src/api/{api-vscode.mjs → api-vscode.ts} +22 -19
- inspect_ai/_view/www/src/api/{client-api.mjs → client-api.ts} +93 -53
- inspect_ai/_view/www/src/api/index.ts +51 -0
- inspect_ai/_view/www/src/api/jsonrpc.ts +225 -0
- inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
- inspect_ai/_view/www/src/components/DownloadButton.mjs +1 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
- inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
- inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
- inspect_ai/_view/www/src/index.js +77 -4
- inspect_ai/_view/www/src/log/{remoteLogFile.mjs → remoteLogFile.ts} +62 -46
- inspect_ai/_view/www/src/navbar/Navbar.mjs +4 -1
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +19 -10
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
- inspect_ai/_view/www/src/samples/SampleList.mjs +19 -49
- inspect_ai/_view/www/src/samples/SampleScores.mjs +1 -1
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +38 -26
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +14 -11
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +2 -2
- inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
- inspect_ai/_view/www/src/types/log.d.ts +13 -2
- inspect_ai/_view/www/src/utils/Format.mjs +10 -3
- inspect_ai/_view/www/src/utils/{Json.mjs → json-worker.ts} +13 -9
- inspect_ai/_view/www/src/utils/vscode.ts +36 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +11 -5
- inspect_ai/_view/www/vite.config.js +7 -0
- inspect_ai/_view/www/yarn.lock +116 -0
- inspect_ai/approval/_human/__init__.py +0 -0
- inspect_ai/approval/_human/manager.py +1 -1
- inspect_ai/approval/_policy.py +12 -6
- inspect_ai/log/_log.py +1 -1
- inspect_ai/log/_samples.py +16 -0
- inspect_ai/log/_transcript.py +4 -1
- inspect_ai/model/_call_tools.py +59 -0
- inspect_ai/model/_conversation.py +16 -7
- inspect_ai/model/_generate_config.py +12 -12
- inspect_ai/model/_model.py +117 -18
- inspect_ai/model/_model_output.py +22 -2
- inspect_ai/model/_openai.py +383 -0
- inspect_ai/model/_providers/anthropic.py +152 -55
- inspect_ai/model/_providers/azureai.py +21 -21
- inspect_ai/model/_providers/bedrock.py +37 -40
- inspect_ai/model/_providers/goodfire.py +248 -0
- inspect_ai/model/_providers/google.py +46 -54
- inspect_ai/model/_providers/groq.py +7 -3
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +13 -12
- inspect_ai/model/_providers/openai.py +51 -218
- inspect_ai/model/_providers/openai_o1.py +11 -12
- inspect_ai/model/_providers/providers.py +23 -1
- inspect_ai/model/_providers/together.py +12 -12
- inspect_ai/model/_providers/util/__init__.py +2 -3
- inspect_ai/model/_providers/util/hf_handler.py +1 -1
- inspect_ai/model/_providers/util/llama31.py +1 -1
- inspect_ai/model/_providers/util/util.py +0 -76
- inspect_ai/model/_providers/vertex.py +1 -4
- inspect_ai/scorer/_metric.py +3 -0
- inspect_ai/scorer/_reducer/reducer.py +1 -1
- inspect_ai/scorer/_scorer.py +4 -3
- inspect_ai/solver/__init__.py +4 -5
- inspect_ai/solver/_basic_agent.py +1 -1
- inspect_ai/solver/_bridge/__init__.py +3 -0
- inspect_ai/solver/_bridge/bridge.py +100 -0
- inspect_ai/solver/_bridge/patch.py +170 -0
- inspect_ai/solver/_prompt.py +35 -5
- inspect_ai/solver/_solver.py +6 -0
- inspect_ai/solver/_task_state.py +80 -38
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +12 -1
- inspect_ai/tool/_tool_call.py +10 -0
- inspect_ai/tool/_tool_def.py +16 -5
- inspect_ai/tool/_tool_with.py +21 -4
- inspect_ai/tool/beta/__init__.py +5 -0
- inspect_ai/tool/beta/_computer/__init__.py +3 -0
- inspect_ai/tool/beta/_computer/_common.py +133 -0
- inspect_ai/tool/beta/_computer/_computer.py +155 -0
- inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
- inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
- inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
- inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_display.py +5 -0
- inspect_ai/util/_limit.py +26 -0
- inspect_ai/util/_sandbox/docker/docker.py +64 -1
- inspect_ai/util/_sandbox/docker/internal.py +3 -1
- inspect_ai/util/_sandbox/docker/prereqs.py +1 -1
- inspect_ai/util/_sandbox/environment.py +14 -0
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/METADATA +3 -2
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/RECORD +159 -126
- inspect_ai/_view/www/src/api/Types.mjs +0 -117
- inspect_ai/_view/www/src/api/api-http.mjs +0 -300
- inspect_ai/_view/www/src/api/api-shared.mjs +0 -10
- inspect_ai/_view/www/src/api/index.mjs +0 -49
- inspect_ai/_view/www/src/api/jsonrpc.mjs +0 -208
- inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
- inspect_ai/_view/www/src/utils/vscode.mjs +0 -16
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/top_level.txt +0 -0
inspect_ai/_display/textual/widgets/sandbox.py
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
from textual.app import ComposeResult
|
2
|
+
from textual.containers import Horizontal, Vertical
|
3
|
+
from textual.widgets import Static
|
4
|
+
|
5
|
+
from inspect_ai.util._sandbox.environment import SandboxConnection
|
6
|
+
|
7
|
+
from .port_mappings import PortMappingsView
|
8
|
+
|
9
|
+
|
10
|
+
class SandboxView(Vertical):
    """Vertical container describing one sandbox connection.

    Renders the connection's command and, when the connection has any
    ports, a `PortMappingsView` for them. When a sandbox name is supplied
    it is shown as a header line and the details are preceded by a spacer
    styled with the `indent` class; otherwise the spacer uses `no_indent`.
    """

    DEFAULT_CSS = """
    .indent {
        width: 2;
    }
    .no_indent {
        width: 0;
    }
    """

    def __init__(
        self,
        connection: SandboxConnection,
        name: str | None,  # if None, no header or indent
    ) -> None:
        """Store the connection and optional sandbox name for `compose()`."""
        super().__init__()
        self.connection = connection
        self.sandbox_name = name

    def compose(self) -> ComposeResult:
        """Yield the optional header, then the (possibly indented) details."""
        header = self.sandbox_name
        if header:
            yield Static(header)

        # the spacer column is what produces the indent under a header
        spacer_class = "indent" if header else "no_indent"
        with Horizontal():
            yield Static("", classes=spacer_class)
            with Vertical():
                yield Static(self.connection.command)
                if self.connection.ports:
                    yield PortMappingsView(self.connection.ports)
|
inspect_ai/_eval/eval.py
CHANGED
@@ -35,7 +35,12 @@ from inspect_ai.scorer._reducer import reducer_log_names
|
|
35
35
|
from inspect_ai.solver._chain import chain
|
36
36
|
from inspect_ai.solver._solver import Solver, SolverSpec
|
37
37
|
from inspect_ai.util import SandboxEnvironmentType
|
38
|
-
from inspect_ai.util._display import
|
38
|
+
from inspect_ai.util._display import (
|
39
|
+
DisplayType,
|
40
|
+
display_type,
|
41
|
+
display_type_initialized,
|
42
|
+
init_display_type,
|
43
|
+
)
|
39
44
|
|
40
45
|
from .context import init_eval_context
|
41
46
|
from .loader import ResolvedTask, resolve_tasks
|
@@ -306,6 +311,10 @@ async def eval_async(
|
|
306
311
|
|
307
312
|
_eval_async_running = True
|
308
313
|
|
314
|
+
# if we are called outside of eval() then set display type to "plain"
|
315
|
+
if not display_type_initialized():
|
316
|
+
init_display_type("plain")
|
317
|
+
|
309
318
|
# resolve model and task args
|
310
319
|
model_args = resolve_args(model_args)
|
311
320
|
task_args = resolve_args(task_args)
|
inspect_ai/_eval/loader.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import ast
|
2
2
|
import contextlib
|
3
|
+
import inspect
|
3
4
|
import os
|
4
5
|
from dataclasses import dataclass, field
|
5
6
|
from importlib.machinery import SourceFileLoader
|
@@ -9,11 +10,13 @@ from pathlib import Path
|
|
9
10
|
from types import ModuleType
|
10
11
|
from typing import Any, Callable, cast
|
11
12
|
|
13
|
+
from typing_extensions import overload
|
14
|
+
|
12
15
|
from inspect_ai._eval.task.util import task_file, task_run_dir
|
13
16
|
from inspect_ai._util.decorator import parse_decorators
|
14
17
|
from inspect_ai._util.error import PrerequisiteError
|
15
18
|
from inspect_ai._util.logger import warn_once
|
16
|
-
from inspect_ai._util.path import chdir_python
|
19
|
+
from inspect_ai._util.path import chdir_python, cwd_relative_path
|
17
20
|
from inspect_ai._util.registry import (
|
18
21
|
RegistryInfo,
|
19
22
|
is_registry_object,
|
@@ -23,6 +26,7 @@ from inspect_ai._util.registry import (
|
|
23
26
|
registry_params,
|
24
27
|
)
|
25
28
|
from inspect_ai.model import Model, ModelName
|
29
|
+
from inspect_ai.solver._bridge import bridge
|
26
30
|
from inspect_ai.solver._solver import Solver, SolverSpec
|
27
31
|
from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
|
28
32
|
from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
|
@@ -334,6 +338,16 @@ def split_spec(spec: str) -> tuple[str, str | None]:
|
|
334
338
|
return spec, None
|
335
339
|
|
336
340
|
|
341
|
+
@overload
|
342
|
+
def load_module(
|
343
|
+
module_path: Path, filter: Callable[[str], bool]
|
344
|
+
) -> ModuleType | None: ...
|
345
|
+
|
346
|
+
|
347
|
+
@overload
|
348
|
+
def load_module(module_path: Path, filter: None = None) -> ModuleType: ...
|
349
|
+
|
350
|
+
|
337
351
|
def load_module(
|
338
352
|
module_path: Path, filter: Callable[[str], bool] | None = None
|
339
353
|
) -> ModuleType | None:
|
@@ -425,28 +439,74 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
|
|
425
439
|
else contextlib.nullcontext()
|
426
440
|
)
|
427
441
|
|
442
|
+
# pretty solver name for error messages
|
443
|
+
pretty_solver_file = (
|
444
|
+
cwd_relative_path(solver_file.as_posix()) if solver_file else None
|
445
|
+
)
|
446
|
+
|
428
447
|
with create_cm:
|
429
|
-
# if
|
430
|
-
if solver_file is
|
431
|
-
|
432
|
-
|
448
|
+
# if there is no solver file then just create from the registry by name
|
449
|
+
if solver_file is None:
|
450
|
+
if solver_name is None:
|
451
|
+
raise ValueError(f"Unable to resolve solver name from {spec.solver}")
|
452
|
+
return cast(Solver, registry_create("solver", solver_name, **spec.args))
|
433
453
|
|
434
|
-
|
454
|
+
# we do have a solver file
|
455
|
+
else:
|
456
|
+
# load the module and parse decorators
|
457
|
+
solver_module = load_module(solver_file)
|
458
|
+
decorators = parse_decorators(solver_file, "solver")
|
459
|
+
|
460
|
+
# if there is no solver_name see if we can discover it
|
435
461
|
if solver_name is None:
|
436
|
-
|
437
|
-
|
462
|
+
if len(decorators) == 1:
|
463
|
+
# decorator based solver
|
464
|
+
solver_name = decorators[0][0]
|
465
|
+
elif len(decorators) == 0:
|
466
|
+
# see if we can find an agent based solver
|
467
|
+
functions = [
|
468
|
+
function
|
469
|
+
for function in inspect.getmembers(
|
470
|
+
solver_module, inspect.isfunction
|
471
|
+
)
|
472
|
+
if function[1].__module__ == solver_module.__name__
|
473
|
+
]
|
474
|
+
agent_functions = [
|
475
|
+
function
|
476
|
+
for function in functions
|
477
|
+
if "agent" in function[0] and not function[0].startswith("_")
|
478
|
+
]
|
479
|
+
if len(agent_functions) == 1:
|
480
|
+
# agent based solver
|
481
|
+
solver_name = agent_functions[0][0]
|
482
|
+
|
483
|
+
elif len(agent_functions) == 0:
|
484
|
+
raise PrerequisiteError(
|
485
|
+
f"The source file {pretty_solver_file} does not contain any @solver functions or agent functions."
|
486
|
+
)
|
487
|
+
else:
|
488
|
+
raise PrerequisiteError(
|
489
|
+
f"The source file {pretty_solver_file} has more than one agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
|
490
|
+
)
|
491
|
+
else:
|
438
492
|
raise PrerequisiteError(
|
439
|
-
f"The source file {
|
493
|
+
f"The source file {pretty_solver_file} has more than one @solver function (qualify which solver using e.g. '{solver_file.name}y@solver_fn')"
|
440
494
|
)
|
441
|
-
if len(solvers) > 1:
|
442
|
-
raise PrerequisiteError(
|
443
|
-
f"The source file {solver_file.as_posix()} has more than one @solver function (qualify which solver using file.py@solver)"
|
444
|
-
)
|
445
|
-
solver_name = solvers[0][0]
|
446
495
|
|
447
|
-
|
448
|
-
|
449
|
-
|
496
|
+
# create decorator based solvers using the registry
|
497
|
+
if any(solver[0] == solver_name for solver in decorators):
|
498
|
+
return cast(Solver, registry_create("solver", solver_name, **spec.args))
|
450
499
|
|
451
|
-
|
452
|
-
|
500
|
+
# create agent based solvers by calling the function and wrapping it in bridge()
|
501
|
+
else:
|
502
|
+
agent_fn = getattr(solver_module, solver_name, None)
|
503
|
+
if inspect.isfunction(agent_fn):
|
504
|
+
return bridge(agent_fn(**spec.args))
|
505
|
+
elif agent_fn is not None:
|
506
|
+
raise PrerequisiteError(
|
507
|
+
f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
|
508
|
+
)
|
509
|
+
else:
|
510
|
+
raise PrerequisiteError(
|
511
|
+
f"The function {solver_name} was not found in file {pretty_solver_file}."
|
512
|
+
)
|
inspect_ai/_eval/registry.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import inspect
|
2
2
|
import logging
|
3
3
|
from copy import deepcopy
|
4
|
+
from functools import wraps
|
4
5
|
from pathlib import Path
|
5
6
|
from typing import Any, Callable, TypeVar, cast, overload
|
6
7
|
|
@@ -125,6 +126,7 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
|
|
125
126
|
params = list(inspect.signature(task_type).parameters.keys())
|
126
127
|
|
127
128
|
# Create and return the wrapper function
|
129
|
+
@wraps(task_type)
|
128
130
|
def wrapper(*w_args: Any, **w_kwargs: Any) -> Task:
|
129
131
|
# Create the task
|
130
132
|
task_instance = task_type(*w_args, **w_kwargs)
|
@@ -154,6 +156,10 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
|
|
154
156
|
# Return the task instance
|
155
157
|
return task_instance
|
156
158
|
|
159
|
+
# functools.wraps overrides the return type annotation of the inner function, so
|
160
|
+
# we explicitly set it again
|
161
|
+
wrapper.__annotations__["return"] = Task
|
162
|
+
|
157
163
|
# Register the task and return the wrapper
|
158
164
|
return task_register(
|
159
165
|
task=cast(TaskType, wrapper), name=task_name, attribs=attribs, params=params
|
inspect_ai/_eval/score.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Callable, cast
|
|
5
5
|
from inspect_ai._display import display
|
6
6
|
from inspect_ai._util.path import chdir_python
|
7
7
|
from inspect_ai._util.platform import platform_init
|
8
|
-
from inspect_ai._util.registry import registry_create
|
8
|
+
from inspect_ai._util.registry import registry_create, registry_unqualified_name
|
9
9
|
from inspect_ai.log import (
|
10
10
|
EvalLog,
|
11
11
|
EvalMetric,
|
@@ -85,6 +85,7 @@ async def score_async(
|
|
85
85
|
sample_id=sample.id,
|
86
86
|
epoch=sample.epoch,
|
87
87
|
input=sample.input,
|
88
|
+
target=Target(sample.target),
|
88
89
|
choices=sample.choices,
|
89
90
|
messages=sample.messages,
|
90
91
|
output=sample.output,
|
@@ -184,6 +185,7 @@ async def run_score_task(
|
|
184
185
|
results[scorer_name] = SampleScore(
|
185
186
|
score=result,
|
186
187
|
sample_id=state.sample_id,
|
188
|
+
scorer=registry_unqualified_name(scorer),
|
187
189
|
)
|
188
190
|
|
189
191
|
progress()
|
inspect_ai/_eval/task/results.py
CHANGED
@@ -2,6 +2,7 @@ import fnmatch
|
|
2
2
|
import re
|
3
3
|
from collections import defaultdict
|
4
4
|
from copy import deepcopy
|
5
|
+
from dataclasses import dataclass, field
|
5
6
|
from typing import Any, Tuple, cast
|
6
7
|
|
7
8
|
from inspect_ai._util.registry import (
|
@@ -19,6 +20,8 @@ from inspect_ai.log import (
|
|
19
20
|
from inspect_ai.log._log import EvalSampleReductions
|
20
21
|
from inspect_ai.scorer import Metric, Score, Scorer
|
21
22
|
from inspect_ai.scorer._metric import SampleScore
|
23
|
+
from inspect_ai.scorer._metrics.accuracy import accuracy
|
24
|
+
from inspect_ai.scorer._metrics.std import stderr
|
22
25
|
from inspect_ai.scorer._reducer import ScoreReducer, mean_score, reducer_log_name
|
23
26
|
from inspect_ai.scorer._scorer import (
|
24
27
|
SCORER_METRICS,
|
@@ -27,6 +30,27 @@ from inspect_ai.scorer._scorer import (
|
|
27
30
|
)
|
28
31
|
|
29
32
|
|
33
|
+
@dataclass
class ScorerInfo:
    """Name, metrics, params and metadata describing a single scorer."""

    # unqualified scorer name (or the bare score name for `from_name`)
    name: str
    # either a list of metrics (optionally mixed with metric dicts) or a metric dict
    metrics: list[Metric | dict[str, list[Metric]]] | dict[str, list[Metric]]
    params: dict[str, Any] = field(default_factory=dict)
    metadata: dict[str, Any] = field(default_factory=dict)

    @staticmethod
    def from_scorer(scorer: Scorer) -> "ScorerInfo":
        """Build a `ScorerInfo` from a scorer's registry information.

        The scorer's registry metadata is deep-copied and the
        `SCORER_METRICS` entry removed from the copy, so `metadata`
        carries only the non-metrics entries.
        """
        scorer_name = registry_unqualified_name(scorer)
        metric_specs = scorer_metrics(scorer)

        # copy first so deleting the metrics key never touches the registry's dict
        extra_metadata = deepcopy(registry_info(scorer).metadata)
        del extra_metadata[SCORER_METRICS]

        return ScorerInfo(
            name=scorer_name,
            metrics=metric_specs,
            params=registry_params(scorer),
            metadata=extra_metadata,
        )

    @staticmethod
    def from_name(name: str) -> "ScorerInfo":
        """Build a `ScorerInfo` for a bare name using `accuracy()` and `stderr()`."""
        default_metrics = [accuracy(), stderr()]
        return ScorerInfo(name=name, metrics=default_metrics)
|
52
|
+
|
53
|
+
|
30
54
|
def eval_results(
|
31
55
|
samples: int,
|
32
56
|
scores: list[dict[str, SampleScore]],
|
@@ -38,18 +62,24 @@ def eval_results(
|
|
38
62
|
results = EvalResults(total_samples=samples, completed_samples=len(scores))
|
39
63
|
reductions = None
|
40
64
|
|
65
|
+
# extract scorers info from scorers then create scorers info for any
|
66
|
+
# scores not already accounted for by a scorer name
|
67
|
+
scorers_info = [ScorerInfo.from_scorer(scorer) for scorer in (scorers or [])]
|
68
|
+
scorer_names = {info.name for info in scorers_info}
|
69
|
+
for sample_scores in scores:
|
70
|
+
for name, sample_score in sample_scores.items():
|
71
|
+
if sample_score.scorer is None and name not in scorer_names:
|
72
|
+
scorers_info.append(ScorerInfo.from_name(name))
|
73
|
+
scorer_names.add(name)
|
74
|
+
|
41
75
|
# record scorer
|
42
|
-
if
|
76
|
+
if len(scorers_info) > 0:
|
43
77
|
result_scores: list[EvalScore] = []
|
44
78
|
sample_reductions: list[EvalSampleReductions] = []
|
45
|
-
for
|
46
|
-
# extract non-metrics metadata
|
47
|
-
metadata = deepcopy(registry_info(scorer).metadata)
|
48
|
-
del metadata[SCORER_METRICS]
|
49
|
-
|
79
|
+
for scorer_info in scorers_info:
|
50
80
|
# this scorer
|
51
81
|
scorer_name = unique_scorer_name(
|
52
|
-
|
82
|
+
scorer_info.name, [eval_score.name for eval_score in result_scores]
|
53
83
|
)
|
54
84
|
|
55
85
|
# scores for this scorer
|
@@ -75,7 +105,7 @@ def eval_results(
|
|
75
105
|
|
76
106
|
# Compute metrics for this scorer
|
77
107
|
simple_scores = cast(list[Score], reduced_scores)
|
78
|
-
targets = metrics if metrics is not None else
|
108
|
+
targets = metrics if metrics is not None else scorer_info.metrics
|
79
109
|
if isinstance(targets, list):
|
80
110
|
## split the metrics into the simple metrics and any dictionary
|
81
111
|
## metrics, to be processed independently
|
@@ -88,8 +118,7 @@ def eval_results(
|
|
88
118
|
result_scores.extend(
|
89
119
|
scorer_for_metrics(
|
90
120
|
scorer_name=scorer_name,
|
91
|
-
|
92
|
-
metadata=metadata,
|
121
|
+
scorer_info=scorer_info,
|
93
122
|
scores=simple_scores,
|
94
123
|
metrics=simple_metrics,
|
95
124
|
reducer_name=reducer_display_nm,
|
@@ -99,8 +128,7 @@ def eval_results(
|
|
99
128
|
result_scores.extend(
|
100
129
|
scorers_from_metric_dict(
|
101
130
|
scorer_name=scorer_name,
|
102
|
-
|
103
|
-
metadata=metadata,
|
131
|
+
scorer_info=scorer_info,
|
104
132
|
scores=simple_scores,
|
105
133
|
metrics=dict_metric,
|
106
134
|
reducer_name=reducer_display_nm,
|
@@ -116,8 +144,7 @@ def eval_results(
|
|
116
144
|
result_scores.extend(
|
117
145
|
scorers_from_metric_dict(
|
118
146
|
scorer_name=scorer_name,
|
119
|
-
|
120
|
-
metadata=metadata,
|
147
|
+
scorer_info=scorer_info,
|
121
148
|
scores=simple_scores,
|
122
149
|
metrics=targets,
|
123
150
|
reducer_name=reducer_display_nm,
|
@@ -156,8 +183,7 @@ def split_metrics(
|
|
156
183
|
|
157
184
|
def scorer_for_metrics(
|
158
185
|
scorer_name: str,
|
159
|
-
|
160
|
-
metadata: dict[str, Any],
|
186
|
+
scorer_info: ScorerInfo,
|
161
187
|
scores: list[Score],
|
162
188
|
metrics: list[Metric],
|
163
189
|
reducer_name: str | None = None,
|
@@ -218,8 +244,10 @@ def scorer_for_metrics(
|
|
218
244
|
scorer=scorer_name,
|
219
245
|
reducer=reducer_name,
|
220
246
|
name=scorer_name,
|
221
|
-
params=
|
222
|
-
metadata=metadata
|
247
|
+
params=scorer_info.params,
|
248
|
+
metadata=scorer_info.metadata
|
249
|
+
if len(scorer_info.metadata.keys()) > 0
|
250
|
+
else None,
|
223
251
|
metrics=list_metrics,
|
224
252
|
)
|
225
253
|
)
|
@@ -228,8 +256,7 @@ def scorer_for_metrics(
|
|
228
256
|
|
229
257
|
def scorers_from_metric_dict(
|
230
258
|
scorer_name: str,
|
231
|
-
|
232
|
-
metadata: dict[str, Any],
|
259
|
+
scorer_info: ScorerInfo,
|
233
260
|
scores: list[Score],
|
234
261
|
metrics: dict[str, list[Metric]],
|
235
262
|
reducer_name: str | None = None,
|
@@ -299,8 +326,10 @@ def scorers_from_metric_dict(
|
|
299
326
|
scorer=scorer_name,
|
300
327
|
reducer=reducer_name,
|
301
328
|
name=metric_key,
|
302
|
-
params=
|
303
|
-
metadata=metadata
|
329
|
+
params=scorer_info.params,
|
330
|
+
metadata=scorer_info.metadata
|
331
|
+
if len(scorer_info.metadata.keys()) > 0
|
332
|
+
else None,
|
304
333
|
metrics=result_metrics,
|
305
334
|
)
|
306
335
|
)
|
inspect_ai/_eval/task/run.py
CHANGED
@@ -30,8 +30,9 @@ from inspect_ai._util.hooks import send_telemetry
|
|
30
30
|
from inspect_ai._util.registry import (
|
31
31
|
is_registry_object,
|
32
32
|
registry_log_name,
|
33
|
+
registry_unqualified_name,
|
33
34
|
)
|
34
|
-
from inspect_ai._util.timeouts import Timeout, timeout
|
35
|
+
from inspect_ai._util.timeouts import Timeout, timeout
|
35
36
|
from inspect_ai._view.notify import view_notify_eval
|
36
37
|
from inspect_ai.dataset import Dataset, Sample
|
37
38
|
from inspect_ai.log import (
|
@@ -45,7 +46,11 @@ from inspect_ai.log import (
|
|
45
46
|
from inspect_ai.log._condense import condense_sample
|
46
47
|
from inspect_ai.log._file import eval_log_json_str
|
47
48
|
from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
|
48
|
-
from inspect_ai.log._samples import
|
49
|
+
from inspect_ai.log._samples import (
|
50
|
+
active_sample,
|
51
|
+
set_active_sample_message_limit,
|
52
|
+
set_active_sample_token_limit,
|
53
|
+
)
|
49
54
|
from inspect_ai.log._transcript import (
|
50
55
|
ErrorEvent,
|
51
56
|
SampleInitEvent,
|
@@ -72,6 +77,7 @@ from inspect_ai.solver._chain import Chain, unroll
|
|
72
77
|
from inspect_ai.solver._fork import set_task_generate
|
73
78
|
from inspect_ai.solver._solver import Solver
|
74
79
|
from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
|
80
|
+
from inspect_ai.util._limit import SampleLimitExceededError
|
75
81
|
from inspect_ai.util._sandbox.context import sandbox_connections
|
76
82
|
from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
|
77
83
|
from inspect_ai.util._subtask import init_subtask
|
@@ -538,6 +544,9 @@ async def task_run_sample(
|
|
538
544
|
# helper to handle exceptions (will throw if we've exceeded the limit)
|
539
545
|
def handle_error(ex: BaseException) -> EvalError:
|
540
546
|
err = sample_error(ex)
|
547
|
+
py_logger.warning(
|
548
|
+
f"Sample error (id: {sample.id}, epoch: {state.epoch}): {exception_message(ex)})"
|
549
|
+
)
|
541
550
|
transcript()._event(ErrorEvent(error=err))
|
542
551
|
return err
|
543
552
|
|
@@ -630,30 +639,43 @@ async def task_run_sample(
|
|
630
639
|
else:
|
631
640
|
raise
|
632
641
|
|
642
|
+
except SampleLimitExceededError as ex:
|
643
|
+
# sample limit event
|
644
|
+
transcript()._event(
|
645
|
+
SampleLimitEvent(
|
646
|
+
type=ex.type,
|
647
|
+
limit=ex.limit,
|
648
|
+
message=f"Sample completed: {ex.message}",
|
649
|
+
)
|
650
|
+
)
|
651
|
+
|
652
|
+
# capture most recent state for scoring
|
653
|
+
state = sample_state() or state
|
654
|
+
state.completed = True
|
655
|
+
|
633
656
|
except BaseException as ex:
|
634
657
|
error = handle_error(ex)
|
635
658
|
|
636
|
-
# set timeout for scoring. if the original timeout was
|
637
|
-
#
|
638
|
-
# timeout time. if the original timeout was hit we still want
|
639
|
-
# to provide an opportunity for scoring, but we don't necessarily
|
659
|
+
# set timeout for scoring. if the original timeout was hit we still
|
660
|
+
# want to provide opportunity for scoring, but we don't necessarily
|
640
661
|
# want to wait the full timeout again (especially in the case where
|
641
662
|
# the cause of the timeout is a hung container and scoring requires
|
642
663
|
# interacting with the container). as a middle ground we use half
|
643
664
|
# of the original timeout value for scoring.
|
644
665
|
if isinstance(timeout_cm, Timeout):
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
666
|
+
assert time_limit
|
667
|
+
timeout_cm = timeout(time_limit / 2)
|
668
|
+
|
669
|
+
# turn off sample limits
|
670
|
+
set_active_sample_token_limit(None)
|
671
|
+
set_active_sample_message_limit(None)
|
650
672
|
|
651
673
|
# scoring
|
652
674
|
try:
|
653
675
|
# timeout during scoring will result in an ordinary sample error
|
654
676
|
async with timeout_cm:
|
655
|
-
if
|
656
|
-
for scorer in scorers:
|
677
|
+
if error is None:
|
678
|
+
for scorer in scorers or []:
|
657
679
|
scorer_name = unique_scorer_name(
|
658
680
|
scorer, list(results.keys())
|
659
681
|
)
|
@@ -667,6 +689,7 @@ async def task_run_sample(
|
|
667
689
|
sample_score = SampleScore(
|
668
690
|
score=score_result,
|
669
691
|
sample_id=sample.id,
|
692
|
+
scorer=registry_unqualified_name(scorer),
|
670
693
|
)
|
671
694
|
transcript()._event(
|
672
695
|
ScoreEvent(
|
@@ -675,6 +698,16 @@ async def task_run_sample(
|
|
675
698
|
)
|
676
699
|
results[scorer_name] = sample_score
|
677
700
|
|
701
|
+
# add scores returned by solvers
|
702
|
+
if state.scores is not None:
|
703
|
+
for name, score in state.scores.items():
|
704
|
+
results[name] = SampleScore(
|
705
|
+
score=score, sample_id=state.sample_id
|
706
|
+
)
|
707
|
+
|
708
|
+
# propagate results into scores
|
709
|
+
state.scores = {k: v.score for k, v in results.items()}
|
710
|
+
|
678
711
|
except asyncio.CancelledError:
|
679
712
|
if active.interrupt_action:
|
680
713
|
transcript()._event(
|
@@ -819,6 +852,7 @@ async def resolve_dataset(
|
|
819
852
|
epoch=epoch,
|
820
853
|
model=model_name,
|
821
854
|
input=sample.input,
|
855
|
+
target=Target(sample.target),
|
822
856
|
choices=sample.choices,
|
823
857
|
messages=sample_messages(sample),
|
824
858
|
message_limit=message_limit,
|
inspect_ai/_eval/task/sandbox.py
CHANGED
@@ -4,11 +4,13 @@ import contextlib
|
|
4
4
|
from random import random
|
5
5
|
from typing import AsyncGenerator, Callable, NamedTuple, cast
|
6
6
|
|
7
|
+
import httpx
|
8
|
+
|
7
9
|
from inspect_ai._eval.task.task import Task
|
8
10
|
from inspect_ai._eval.task.util import task_run_dir
|
9
11
|
from inspect_ai._util.file import file, filesystem
|
10
12
|
from inspect_ai._util.registry import registry_unqualified_name
|
11
|
-
from inspect_ai._util.url import data_uri_to_base64, is_data_uri
|
13
|
+
from inspect_ai._util.url import data_uri_to_base64, is_data_uri, is_http_url
|
12
14
|
from inspect_ai.dataset import Sample
|
13
15
|
from inspect_ai.util._concurrency import concurrency
|
14
16
|
from inspect_ai.util._sandbox.context import (
|
@@ -65,12 +67,12 @@ async def sandboxenv_context(
|
|
65
67
|
files: dict[str, bytes] = {}
|
66
68
|
if sample.files:
|
67
69
|
for path, contents in sample.files.items():
|
68
|
-
files[path] = read_sandboxenv_file(contents)
|
70
|
+
files[path] = await read_sandboxenv_file(contents)
|
69
71
|
|
70
72
|
# read setup script from sample (add bash shebang if necessary)
|
71
73
|
setup: bytes | None = None
|
72
74
|
if sample.setup:
|
73
|
-
setup = read_sandboxenv_file(sample.setup)
|
75
|
+
setup = await read_sandboxenv_file(sample.setup)
|
74
76
|
setup_str = setup.decode(encoding="utf-8")
|
75
77
|
if not setup_str.strip().startswith("#!"):
|
76
78
|
setup_str = f"#!/usr/bin/env bash\n\n{setup_str}"
|
@@ -108,13 +110,16 @@ async def sandboxenv_context(
|
|
108
110
|
)
|
109
111
|
|
110
112
|
|
111
|
-
def read_sandboxenv_file(contents: str) -> bytes:
|
113
|
+
async def read_sandboxenv_file(contents: str) -> bytes:
|
112
114
|
if is_data_uri(contents):
|
113
115
|
contents_base64 = data_uri_to_base64(contents)
|
114
116
|
file_bytes = base64.b64decode(contents_base64)
|
117
|
+
elif is_http_url(contents):
|
118
|
+
client = httpx.AsyncClient()
|
119
|
+
file_bytes = (await client.get(contents, follow_redirects=True)).content
|
115
120
|
else:
|
116
121
|
# try to read as a file (if it doesn't exist or has a path not cool w/
|
117
|
-
# the
|
122
|
+
# the filesystem then we fall back to contents)
|
118
123
|
try:
|
119
124
|
fs = filesystem(contents)
|
120
125
|
if fs.exists(contents):
|
inspect_ai/_util/constants.py
CHANGED
[hunk not included in this extract: +1 -0]
inspect_ai/_util/port_names.py
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
from typing import Literal
|
2
|
+
|
3
|
+
|
4
|
+
def get_service_by_port(port: int, protocol: Literal["tcp", "udp"]) -> str | None:
|
5
|
+
"""
|
6
|
+
Returns the likely service running on a given port number.
|
7
|
+
|
8
|
+
Args:
|
9
|
+
port (int): The port number to look up
|
10
|
+
protocol (str): Either 'tcp' or 'udp'
|
11
|
+
|
12
|
+
Returns:
|
13
|
+
str: Description of the likely service, or None if not found
|
14
|
+
"""
|
15
|
+
# Common port mappings based on IANA assignments and common usage
|
16
|
+
port_mappings = {
|
17
|
+
"tcp": {
|
18
|
+
20: "FTP (Data)",
|
19
|
+
21: "FTP (Control)",
|
20
|
+
22: "SSH",
|
21
|
+
23: "Telnet",
|
22
|
+
25: "SMTP",
|
23
|
+
53: "DNS",
|
24
|
+
80: "HTTP",
|
25
|
+
110: "POP3",
|
26
|
+
143: "IMAP",
|
27
|
+
443: "HTTPS",
|
28
|
+
445: "Microsoft-DS (SMB)",
|
29
|
+
587: "SMTP (Submission)",
|
30
|
+
993: "IMAPS",
|
31
|
+
995: "POP3S",
|
32
|
+
1433: "Microsoft SQL Server",
|
33
|
+
1521: "Oracle Database",
|
34
|
+
3306: "MySQL",
|
35
|
+
3389: "RDP (Remote Desktop)",
|
36
|
+
5432: "PostgreSQL",
|
37
|
+
5900: "VNC",
|
38
|
+
5901: "VNC Display :1",
|
39
|
+
5902: "VNC Display :2",
|
40
|
+
6080: "noVNC",
|
41
|
+
8080: "HTTP Alternate",
|
42
|
+
8443: "HTTPS Alternate",
|
43
|
+
27017: "MongoDB",
|
44
|
+
27018: "MongoDB Shard",
|
45
|
+
27019: "MongoDB Config Server",
|
46
|
+
},
|
47
|
+
"udp": {
|
48
|
+
53: "DNS",
|
49
|
+
67: "DHCP Server",
|
50
|
+
68: "DHCP Client",
|
51
|
+
69: "TFTP",
|
52
|
+
123: "NTP",
|
53
|
+
161: "SNMP",
|
54
|
+
162: "SNMP Trap",
|
55
|
+
514: "Syslog",
|
56
|
+
1194: "OpenVPN",
|
57
|
+
5353: "mDNS",
|
58
|
+
},
|
59
|
+
}
|
60
|
+
|
61
|
+
return port_mappings.get(protocol, {}).get(port, None)
|