inspect-ai 0.3.74__py3-none-any.whl → 0.3.76__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +3 -2
- inspect_ai/_cli/cache.py +1 -1
- inspect_ai/_cli/common.py +15 -0
- inspect_ai/_cli/eval.py +4 -5
- inspect_ai/_cli/log.py +1 -1
- inspect_ai/_cli/sandbox.py +1 -1
- inspect_ai/_cli/trace.py +1 -1
- inspect_ai/_cli/view.py +1 -1
- inspect_ai/_display/core/config.py +3 -1
- inspect_ai/_eval/eval.py +55 -61
- inspect_ai/_eval/evalset.py +64 -154
- inspect_ai/_eval/loader.py +27 -54
- inspect_ai/_eval/registry.py +4 -15
- inspect_ai/_eval/run.py +7 -4
- inspect_ai/_eval/task/__init__.py +8 -2
- inspect_ai/_eval/task/log.py +9 -1
- inspect_ai/_eval/task/resolved.py +35 -0
- inspect_ai/_eval/task/run.py +4 -0
- inspect_ai/_eval/task/task.py +50 -69
- inspect_ai/_eval/task/tasks.py +30 -0
- inspect_ai/_util/constants.py +3 -0
- inspect_ai/_util/dotenv.py +17 -0
- inspect_ai/_util/logger.py +3 -0
- inspect_ai/_util/registry.py +43 -2
- inspect_ai/_view/server.py +28 -10
- inspect_ai/_view/www/dist/assets/index.css +32 -19
- inspect_ai/_view/www/dist/assets/index.js +17682 -29989
- inspect_ai/_view/www/log-schema.json +79 -9
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/appearance/styles.ts +6 -5
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
- inspect_ai/_view/www/src/constants.ts +3 -0
- inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +1 -1
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +2 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
- inspect_ai/_view/www/src/types/log.d.ts +11 -5
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
- inspect_ai/_view/www/yarn.lock +12 -5
- inspect_ai/log/_log.py +10 -1
- inspect_ai/log/_recorders/eval.py +27 -8
- inspect_ai/log/_recorders/json.py +10 -2
- inspect_ai/log/_transcript.py +13 -4
- inspect_ai/model/_call_tools.py +13 -4
- inspect_ai/model/_chat_message.py +15 -1
- inspect_ai/model/_model.py +30 -12
- inspect_ai/model/_model_output.py +6 -1
- inspect_ai/model/_openai.py +11 -6
- inspect_ai/model/_providers/anthropic.py +167 -77
- inspect_ai/model/_providers/google.py +6 -2
- inspect_ai/model/_providers/none.py +31 -0
- inspect_ai/model/_providers/openai.py +11 -8
- inspect_ai/model/_providers/providers.py +7 -0
- inspect_ai/model/_providers/vertex.py +5 -2
- inspect_ai/solver/_bridge/bridge.py +1 -1
- inspect_ai/solver/_chain.py +7 -6
- inspect_ai/tool/__init__.py +4 -0
- inspect_ai/tool/_tool_call.py +5 -2
- inspect_ai/tool/_tool_support_helpers.py +200 -0
- inspect_ai/tool/_tools/_bash_session.py +119 -0
- inspect_ai/tool/_tools/_computer/_computer.py +1 -1
- inspect_ai/tool/_tools/_text_editor.py +121 -0
- inspect_ai/tool/_tools/_web_browser/_back_compat.py +150 -0
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +75 -130
- inspect_ai/tool/_tools/_web_search.py +2 -2
- inspect_ai/util/_json.py +28 -0
- inspect_ai/util/_sandbox/context.py +18 -8
- inspect_ai/util/_sandbox/docker/config.py +1 -1
- inspect_ai/util/_sandbox/docker/internal.py +3 -3
- inspect_ai/util/_sandbox/environment.py +17 -2
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/METADATA +8 -5
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/RECORD +85 -108
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/WHEEL +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +0 -8
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +0 -24
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +0 -25
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +0 -22
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +0 -63
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +0 -323
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +0 -5
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +0 -279
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +0 -9
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +0 -293
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +0 -94
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +0 -2
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +0 -2
- inspect_ai/tool/_tools/_web_browser/_resources/mock_environment.py +0 -45
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +0 -50
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +0 -48
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +0 -280
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +0 -64
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +0 -146
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +0 -64
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +0 -180
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +0 -99
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +0 -15
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +0 -44
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +0 -39
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +0 -214
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +0 -35
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +0 -192
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info/licenses}/LICENSE +0 -0
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/top_level.txt +0 -0
inspect_ai/__init__.py
CHANGED
@@ -7,7 +7,8 @@ from inspect_ai._eval.evalset import eval_set
 from inspect_ai._eval.list import list_tasks
 from inspect_ai._eval.registry import task
 from inspect_ai._eval.score import score, score_async
-from inspect_ai._eval.task import Epochs, Task, TaskInfo,
+from inspect_ai._eval.task import Epochs, Task, TaskInfo, task_with
+from inspect_ai._eval.task.tasks import Tasks
 from inspect_ai._util.constants import PKG_NAME
 from inspect_ai.solver._human_agent.agent import human_agent
 
@@ -26,8 +27,8 @@ __all__ = [
     "score_async",
     "Epochs",
     "Task",
-    "TaskInfo",
     "Tasks",
+    "TaskInfo",
     "task",
     "task_with",
 ]
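Both `Tasks` and `task_with` remain exported from the package root after this import reshuffle. A minimal sketch of how they fit together (the toy task below is hypothetical, not taken from this diff):

from inspect_ai import Task, Tasks, task, task_with
from inspect_ai.dataset import Sample
from inspect_ai.scorer import match
from inspect_ai.solver import generate

@task
def addition() -> Task:
    # hypothetical one-sample task, used only to exercise the imports above
    return Task(
        dataset=[Sample(input="What is 1 + 1?", target="2")],
        solver=generate(),
        scorer=match(),
    )

def addition_variants() -> Tasks:
    # Tasks is the type accepted by eval()/eval_set(); task_with() adapts an
    # existing task with alternate options (here, a different epoch count)
    base = addition()
    return [base, task_with(base, epochs=3)]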
inspect_ai/_cli/cache.py
CHANGED
@@ -44,7 +44,7 @@ def _print_table(title: str, paths: list[tuple[str, int]]) -> None:
 def cache_command() -> None:
     """Manage the inspect model output cache.
 
-    Learn more about model output caching at https://inspect.
+    Learn more about model output caching at https://inspect.aisi.org.uk/caching.html.
     """
     return None
 
inspect_ai/_cli/common.py
CHANGED
@@ -10,14 +10,18 @@ from inspect_ai._util.constants import (
     DEFAULT_DISPLAY,
     DEFAULT_LOG_LEVEL,
 )
+from inspect_ai._util.dotenv import init_cli_env
 from inspect_ai.util._display import init_display_type
 
+from .util import parse_cli_args
+
 
 class CommonOptions(TypedDict):
     log_level: str
     log_dir: str
     display: Literal["full", "conversation", "rich", "plain", "none"]
     no_ansi: bool | None
+    env: tuple[str] | None
     debug: bool
     debug_port: int
     debug_errors: bool
@@ -68,6 +72,13 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Do not print ANSI control characters.",
         envvar="INSPECT_NO_ANSI",
     )
+    @click.option(
+        "--env",
+        multiple=True,
+        type=str,
+        envvar="INSPECT_EVAL_ENV",
+        help="Define an environment variable e.g. --env NAME=value (--env can be specified multiple times)",
+    )
     @click.option(
         "--debug", is_flag=True, envvar="INSPECT_DEBUG", help="Wait to attach debugger"
     )
@@ -92,6 +103,10 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
 
 
 def process_common_options(options: CommonOptions) -> None:
+    # set environment variables
+    env_args = parse_cli_args(options["env"])
+    init_cli_env(env_args)
+
     # propagate display
     if options["no_ansi"]:
         display = "rich"
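The new `--env` option (and its INSPECT_EVAL_ENV variable) injects NAME=value pairs into the process environment before the eval runs. A rough sketch of the equivalent parsing, assuming the NAME=value convention described in the option's help text (illustrative only, not the actual `parse_cli_args`/`init_cli_env` implementation):

import os

def apply_env_options(env: tuple[str, ...] | None) -> None:
    # each --env value is expected as NAME=value; repeated flags accumulate
    # because the click option is declared with multiple=True
    for item in env or ():
        name, sep, value = item.partition("=")
        if not name or not sep:
            raise ValueError(f"--env expects NAME=value, got {item!r}")
        os.environ[name] = value

# e.g. apply_env_options(("OPENAI_BASE_URL=http://localhost:8080/v1",))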
inspect_ai/_cli/eval.py
CHANGED
@@ -56,7 +56,6 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--model",
         type=str,
-        required=True,
         help="Model used to evaluate tasks.",
         envvar="INSPECT_EVAL_MODEL",
     )
@@ -441,7 +440,7 @@
 def eval_command(
     tasks: tuple[str] | None,
     solver: str | None,
-    model: str,
+    model: str | None,
     model_base_url: str | None,
     m: tuple[str] | None,
     model_config: str | None,
@@ -608,7 +607,7 @@ def eval_set_command(
     solver: str | None,
     trace: bool | None,
     approval: str | None,
-    model: str,
+    model: str | None,
     model_base_url: str | None,
     m: tuple[str] | None,
     model_config: str | None,
@@ -671,7 +670,7 @@
 ) -> int:
     """Evaluate a set of tasks with retries.
 
-    Learn more about eval sets at https://inspect.
+    Learn more about eval sets at https://inspect.aisi.org.uk/eval-sets.html.
     """
     # read config
     config = config_from_locals(dict(locals()))
@@ -741,7 +740,7 @@ def eval_exec(
     log_level_transcript: str,
     log_dir: str,
     log_format: Literal["eval", "json"] | None,
-    model: str,
+    model: str | None,
     model_base_url: str | None,
     m: tuple[str] | None,
     model_config: str | None,
inspect_ai/_cli/log.py
CHANGED
@@ -30,7 +30,7 @@ def log_command() -> None:
 
     The 'log' commands enable you to read Inspect logs uniformly as JSON no matter their physical storage format, and also enable you to read only the headers (everything but the samples) from log files, which is useful for very large logs.
 
-    Learn more about managing log files at https://inspect.
+    Learn more about managing log files at https://inspect.aisi.org.uk/eval-logs.html.
     """
     return None
 
inspect_ai/_cli/sandbox.py
CHANGED
@@ -9,7 +9,7 @@ from inspect_ai.util._sandbox.registry import registry_find_sandboxenv
 def sandbox_command() -> None:
     """Manage Sandbox Environments.
 
-    Learn more about sandboxing at https://inspect.
+    Learn more about sandboxing at https://inspect.aisi.org.uk/sandboxing.html.
     """
     return None
 
inspect_ai/_cli/trace.py
CHANGED
@@ -28,7 +28,7 @@ def trace_command() -> None:
 
     Inspect includes a TRACE log-level which is right below the HTTP and INFO log levels (so not written to the console by default). However, TRACE logs are always recorded to a separate file, and the last 10 TRACE logs are preserved. The 'trace' command provides ways to list and read these traces.
 
-    Learn more about execution traces at https://inspect.
+    Learn more about execution traces at https://inspect.aisi.org.uk/tracing.html.
     """
     return None
 
inspect_ai/_cli/view.py
CHANGED
@@ -41,7 +41,7 @@ def start_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
 def view_command(ctx: click.Context, **kwargs: Unpack[CommonOptions]) -> None:
     """Inspect log viewer.
 
-    Learn more about using the log viewer at https://inspect.
+    Learn more about using the log viewer at https://inspect.aisi.org.uk/log-viewer.html.
     """
     if ctx.invoked_subcommand is None:
         ctx.invoke(start, **kwargs)
inspect_ai/_display/core/config.py
CHANGED
@@ -1,4 +1,4 @@
-from inspect_ai._util.registry import is_registry_dict
+from inspect_ai._util.registry import is_model_dict, is_registry_dict
 from inspect_ai.log._log import eval_config_defaults
 
 from .display import TaskProfile
@@ -14,6 +14,8 @@ def task_config(
         value = task_args[key]
         if is_registry_dict(value):
             task_args[key] = value["name"]
+        if is_model_dict(value):
+            task_args[key] = value["model"]
     # get eval_config overrides
     eval_config = dict(profile.eval_config.model_dump(exclude_none=True))
     for name, default_value in eval_config_defaults().items():
inspect_ai/_eval/eval.py
CHANGED
@@ -4,6 +4,8 @@ import sys
 from pathlib import Path
 from typing import Any, Literal
 
+from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
+
 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup
 
@@ -47,16 +49,18 @@ from inspect_ai.util._display import (
 )
 
 from .context import init_eval_context
-from .loader import
+from .loader import resolve_tasks
 from .run import eval_run
-from .task import Epochs, PreviousTask
+from .task import Epochs, PreviousTask
+from .task.resolved import ResolvedTask, resolved_model_names
+from .task.tasks import Tasks
 
 log = logging.getLogger(__name__)
 
 
 def eval(
     tasks: Tasks,
-    model: str | Model | list[str] | list[Model] | None =
+    model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
     task_args: dict[str, Any] | str = dict(),
@@ -96,9 +100,9 @@ def eval(
     Args:
         tasks: Task(s) to evaluate. If None, attempt
             to evaluate a task in the current working directory
-        model: Model(s) for
-
-
+        model: Model(s) for evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
+            environment variable. Specify `None` to define no default model(s), which will
+            leave model usage entirely up to tasks.
         model_base_url: Base URL for communicating
             with the model API.
         model_args: Model creation args
@@ -144,7 +148,7 @@ def eval(
         max_samples: Maximum number of samples to run in parallel
             (default is max_connections)
         max_tasks: Maximum number of tasks to run in parallel
-            (
+            (defaults to number of models being evaluated)
         max_subprocesses: Maximum number of subprocesses to
             run in parallel (default is os.cpu_count())
         max_sandboxes: Maximum number of sandboxes (per-provider)
@@ -223,7 +227,7 @@ _eval_async_running = False
 
 async def eval_async(
     tasks: Tasks,
-    model: str | Model | list[str] | list[Model] | None =
+    model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
     task_args: dict[str, Any] | str = dict(),
@@ -259,67 +263,53 @@ async def eval_async(
     r"""Evaluate tasks using a Model (async).
 
     Args:
-        tasks:
+        tasks: Task(s) to evaluate. If None, attempt
             to evaluate a task in the current working directory
-        model (
-
-
-        model_base_url:
-
-
-
-
-
-        sandbox (SandboxEnvironmentType | None): Sandbox environment type
-            (or optionally a str or tuple with a shorthand spec)
-        sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
-            (defaults to True)
-        solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
-            Optional (uses task solver by default).
+        model: Model(s) for evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
+            environment variable. Specify `None` to define no default model(s), which will
+            leave model usage entirely up to tasks.
+        model_base_url: Base URL for communicating with the model API.
+        model_args: Model creation args (as a dictionary or as a path to a JSON or YAML config file)
+        task_args: Task creation arguments (as a dictionary or as a path to a JSON or YAML config file)
+        sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
+        sandbox_cleanup: Cleanup sandbox environments after task completes (defaults to True)
+        solver: Alternative solver for task(s). Optional (uses task solver by default).
         tags (list[str] | None): Tags to associate with this evaluation run.
-        approval:
+        approval: Tool use approval policies.
             Either a path to an approval policy config file or a list of approval policies.
             Defaults to no approval policy.
-        log_level
+        log_level: Level for logging to the console: "debug", "http", "sandbox",
             "info", "warning", "error", or "critical" (defaults to "warning")
-        log_level_transcript
-        log_dir
-
-
-
-
-            (defaults to all samples).
-        sample_id (str | list[str] | None): Evaluate specific sample(s) from the dataset.
-        epochs (int | Epochs | None): Epochs to repeat samples for and optional score
+        log_level_transcript: Level for logging to the log file (defaults to "info")
+        log_dir: Output path for logging results (defaults to file log in ./logs directory).
+        log_format: Format for writing log files (defaults to "eval", the native high-performance format).
+        limit: Limit evaluated samples (defaults to all samples).
+        sample_id: Evaluate specific sample(s) from the dataset.
+        epochs: Epochs to repeat samples for and optional score
             reducer function(s) used to combine sample scores (defaults to "mean")
-        fail_on_error
+        fail_on_error: `True` to fail on first sample error
            (default); `False` to never fail on sample errors; Value between 0 and 1
            to fail if a proportion of total samples fails. Value greater than 1 to fail eval if a count of samples fails.
-        debug_errors
-
-
-        token_limit (int | None): Limit on total tokens used for each sample.
+        debug_errors: Raise task errors (rather than logging them) so they can be debugged (defaults to False).
+        message_limit: Limit on total messages used for each sample.
+        token_limit: Limit on total tokens used for each sample.
         time_limit: Limit on clock time (in seconds) for samples.
         working_limit: Limit on working time (in seconds) for sample. Working
             time includes model generation, tool calls, etc. but does not include
            time spent waiting on retries or shared resources.
-        max_samples
-
-
-
-
-
-
-
-        log_samples: (bool | None): Log detailed samples and scores (defaults to True)
-        log_images: (bool | None): Log base64 encoded version of images,
-            even if specified as a filename or URL (defaults to False)
-        log_buffer: (int | None): Number of samples to buffer before writing log file.
+        max_samples: Maximum number of samples to run in parallel (default is max_connections)
+        max_tasks: Maximum number of tasks to run in parallel
+            (defaults to number of models being evaluated)
+        max_subprocesses: Maximum number of subprocesses to run in parallel (default is os.cpu_count())
+        max_sandboxes: Maximum number of sandboxes (per-provider) to run in parallel.
+        log_samples: Log detailed samples and scores (defaults to True)
+        log_images: Log base64 encoded version of images, even if specified as a filename or URL (defaults to False)
+        log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
-        score
-        score_display
-        **kwargs
+        score: Score output (defaults to True)
+        score_display: Show scoring metrics in realtime (defaults to True)
+        **kwargs: Model generation options.
 
     Returns:
         List of EvalLog (one for each task)
@@ -365,6 +355,12 @@ async def eval_async(
         log.warning("No inspect tasks were found at the specified paths.")
         return []
 
+    # if there is no max tasks then base it on unique model names
+    if max_tasks is None:
+        model_count = len(resolved_model_names(resolved_tasks))
+        if model_count > 1:
+            max_tasks = model_count
+
     # apply conversation display constraints
     if display_type() == "conversation":
         # single task at a time
@@ -450,7 +446,6 @@
             eval_config=eval_config,
             eval_sandbox=sandbox,
             recorder=recorder,
-            model_args=model_args,
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
@@ -475,7 +470,6 @@
             eval_config=eval_config,
             eval_sandbox=sandbox,
             recorder=recorder,
-            model_args=model_args,
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
@@ -529,7 +523,7 @@ def eval_retry(
         max_samples: Maximum number of samples to run in parallel
             (default is max_connections)
         max_tasks: Maximum number of tasks to run in parallel
-            (
+            (defaults to number of models being evaluated)
         max_subprocesses: Maximum number of subprocesses to
             run in parallel (default is os.cpu_count())
         max_sandboxes: Maximum number of sandboxes (per-provider)
@@ -764,7 +758,7 @@ async def eval_retry_async(
         log = (
             await eval_async(
                 tasks=PreviousTask(
-                    id=task_id, task=task, task_args=task_args, log=eval_log
+                    id=task_id, task=task, task_args=task_args, model=None, log=eval_log
                 ),
                 model=model,
                 model_base_url=model_base_url,
@@ -809,7 +803,7 @@
 
 def eval_init(
     tasks: Tasks,
-    model: str | Model | list[str] | list[Model] | None =
+    model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
     task_args: dict[str, Any] | str = dict(),
@@ -886,7 +880,7 @@ def init_eval_display(
     # multiple models not allowed in trace mode
     if isinstance(model, list) and len(model) > 1:
         raise PrerequisiteError(
-            "
+            "Conversation mode cannot be used when evaluating multiple models."
         )
 
     return max_tasks, max_samples
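Taken together with the CLI change above, the `model` argument is now optional end to end: omitting it (NOT_GIVEN) falls back to the INSPECT_EVAL_MODEL environment variable, while passing `None` declares no default model and leaves model selection to the tasks themselves. A minimal sketch of the new calling convention (the toy task and model name are illustrative only):

import os

from inspect_ai import Task, eval, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import match
from inspect_ai.solver import generate

@task
def hello() -> Task:
    # hypothetical single-sample task used only for illustration
    return Task(
        dataset=[Sample(input="Reply with the word hello.", target="hello")],
        solver=generate(),
        scorer=match(),
    )

# model omitted (NOT_GIVEN): resolved from INSPECT_EVAL_MODEL
os.environ["INSPECT_EVAL_MODEL"] = "openai/gpt-4o-mini"
logs = eval(hello(), log_dir="./logs")

# model=None: no default model is defined; appropriate for tasks that
# specify their own model(s), which this toy task does not
# logs = eval(hello(), model=None)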