inspect-ai 0.3.74__py3-none-any.whl → 0.3.76__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. inspect_ai/__init__.py +3 -2
  2. inspect_ai/_cli/cache.py +1 -1
  3. inspect_ai/_cli/common.py +15 -0
  4. inspect_ai/_cli/eval.py +4 -5
  5. inspect_ai/_cli/log.py +1 -1
  6. inspect_ai/_cli/sandbox.py +1 -1
  7. inspect_ai/_cli/trace.py +1 -1
  8. inspect_ai/_cli/view.py +1 -1
  9. inspect_ai/_display/core/config.py +3 -1
  10. inspect_ai/_eval/eval.py +55 -61
  11. inspect_ai/_eval/evalset.py +64 -154
  12. inspect_ai/_eval/loader.py +27 -54
  13. inspect_ai/_eval/registry.py +4 -15
  14. inspect_ai/_eval/run.py +7 -4
  15. inspect_ai/_eval/task/__init__.py +8 -2
  16. inspect_ai/_eval/task/log.py +9 -1
  17. inspect_ai/_eval/task/resolved.py +35 -0
  18. inspect_ai/_eval/task/run.py +4 -0
  19. inspect_ai/_eval/task/task.py +50 -69
  20. inspect_ai/_eval/task/tasks.py +30 -0
  21. inspect_ai/_util/constants.py +3 -0
  22. inspect_ai/_util/dotenv.py +17 -0
  23. inspect_ai/_util/logger.py +3 -0
  24. inspect_ai/_util/registry.py +43 -2
  25. inspect_ai/_view/server.py +28 -10
  26. inspect_ai/_view/www/dist/assets/index.css +32 -19
  27. inspect_ai/_view/www/dist/assets/index.js +17682 -29989
  28. inspect_ai/_view/www/log-schema.json +79 -9
  29. inspect_ai/_view/www/package.json +2 -2
  30. inspect_ai/_view/www/src/appearance/styles.ts +6 -5
  31. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
  32. inspect_ai/_view/www/src/constants.ts +3 -0
  33. inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
  34. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
  35. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
  36. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
  37. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +1 -1
  38. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -2
  39. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
  40. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
  41. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +1 -1
  42. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
  43. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +2 -2
  44. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
  45. inspect_ai/_view/www/src/types/log.d.ts +11 -5
  46. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
  47. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
  48. inspect_ai/_view/www/yarn.lock +12 -5
  49. inspect_ai/log/_log.py +10 -1
  50. inspect_ai/log/_recorders/eval.py +27 -8
  51. inspect_ai/log/_recorders/json.py +10 -2
  52. inspect_ai/log/_transcript.py +13 -4
  53. inspect_ai/model/_call_tools.py +13 -4
  54. inspect_ai/model/_chat_message.py +15 -1
  55. inspect_ai/model/_model.py +30 -12
  56. inspect_ai/model/_model_output.py +6 -1
  57. inspect_ai/model/_openai.py +11 -6
  58. inspect_ai/model/_providers/anthropic.py +167 -77
  59. inspect_ai/model/_providers/google.py +6 -2
  60. inspect_ai/model/_providers/none.py +31 -0
  61. inspect_ai/model/_providers/openai.py +11 -8
  62. inspect_ai/model/_providers/providers.py +7 -0
  63. inspect_ai/model/_providers/vertex.py +5 -2
  64. inspect_ai/solver/_bridge/bridge.py +1 -1
  65. inspect_ai/solver/_chain.py +7 -6
  66. inspect_ai/tool/__init__.py +4 -0
  67. inspect_ai/tool/_tool_call.py +5 -2
  68. inspect_ai/tool/_tool_support_helpers.py +200 -0
  69. inspect_ai/tool/_tools/_bash_session.py +119 -0
  70. inspect_ai/tool/_tools/_computer/_computer.py +1 -1
  71. inspect_ai/tool/_tools/_text_editor.py +121 -0
  72. inspect_ai/tool/_tools/_web_browser/_back_compat.py +150 -0
  73. inspect_ai/tool/_tools/_web_browser/_web_browser.py +75 -130
  74. inspect_ai/tool/_tools/_web_search.py +2 -2
  75. inspect_ai/util/_json.py +28 -0
  76. inspect_ai/util/_sandbox/context.py +18 -8
  77. inspect_ai/util/_sandbox/docker/config.py +1 -1
  78. inspect_ai/util/_sandbox/docker/internal.py +3 -3
  79. inspect_ai/util/_sandbox/environment.py +17 -2
  80. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/METADATA +8 -5
  81. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/RECORD +85 -108
  82. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/WHEEL +1 -1
  83. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +0 -8
  84. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +0 -24
  85. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +0 -25
  86. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +0 -22
  87. inspect_ai/tool/_tools/_web_browser/_resources/README.md +0 -63
  88. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +0 -71
  89. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +0 -323
  90. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +0 -5
  91. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +0 -279
  92. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +0 -9
  93. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +0 -293
  94. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +0 -94
  95. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +0 -2
  96. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +0 -2
  97. inspect_ai/tool/_tools/_web_browser/_resources/mock_environment.py +0 -45
  98. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +0 -50
  99. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +0 -48
  100. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +0 -280
  101. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +0 -65
  102. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +0 -64
  103. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +0 -146
  104. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +0 -64
  105. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +0 -180
  106. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +0 -99
  107. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +0 -15
  108. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +0 -44
  109. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +0 -39
  110. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +0 -214
  111. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +0 -35
  112. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +0 -192
  113. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/entry_points.txt +0 -0
  114. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info/licenses}/LICENSE +0 -0
  115. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/top_level.txt +0 -0
inspect_ai/__init__.py CHANGED
@@ -7,7 +7,8 @@ from inspect_ai._eval.evalset import eval_set
 from inspect_ai._eval.list import list_tasks
 from inspect_ai._eval.registry import task
 from inspect_ai._eval.score import score, score_async
-from inspect_ai._eval.task import Epochs, Task, TaskInfo, Tasks, task_with
+from inspect_ai._eval.task import Epochs, Task, TaskInfo, task_with
+from inspect_ai._eval.task.tasks import Tasks
 from inspect_ai._util.constants import PKG_NAME
 from inspect_ai.solver._human_agent.agent import human_agent
 
@@ -26,8 +27,8 @@ __all__ = [
     "score_async",
     "Epochs",
     "Task",
-    "TaskInfo",
     "Tasks",
+    "TaskInfo",
     "task",
     "task_with",
 ]
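Note that `Tasks` remains re-exported from the package root, so the internal move should be invisible to user code. A minimal illustration:

    # The public import path is unchanged by the internal move:
    from inspect_ai import Tasks, eval

    def evaluate(tasks: Tasks) -> None:
        # Tasks is the task-specifier type accepted by eval()
        # (e.g. a Task, a list of tasks, or task file path(s))
        eval(tasks)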
inspect_ai/_cli/cache.py CHANGED
@@ -44,7 +44,7 @@ def _print_table(title: str, paths: list[tuple[str, int]]) -> None:
 def cache_command() -> None:
     """Manage the inspect model output cache.
 
-    Learn more about model output caching at https://inspect.ai-safety-institute.org.uk/caching.html.
+    Learn more about model output caching at https://inspect.aisi.org.uk/caching.html.
     """
     return None
 
inspect_ai/_cli/common.py CHANGED
@@ -10,14 +10,18 @@ from inspect_ai._util.constants import (
     DEFAULT_DISPLAY,
     DEFAULT_LOG_LEVEL,
 )
+from inspect_ai._util.dotenv import init_cli_env
 from inspect_ai.util._display import init_display_type
 
+from .util import parse_cli_args
+
 
 class CommonOptions(TypedDict):
     log_level: str
     log_dir: str
     display: Literal["full", "conversation", "rich", "plain", "none"]
     no_ansi: bool | None
+    env: tuple[str] | None
     debug: bool
     debug_port: int
     debug_errors: bool
@@ -68,6 +72,13 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Do not print ANSI control characters.",
         envvar="INSPECT_NO_ANSI",
     )
+    @click.option(
+        "--env",
+        multiple=True,
+        type=str,
+        envvar="INSPECT_EVAL_ENV",
+        help="Define an environment variable e.g. --env NAME=value (--env can be specified multiple times)",
+    )
     @click.option(
         "--debug", is_flag=True, envvar="INSPECT_DEBUG", help="Wait to attach debugger"
    )
@@ -92,6 +103,10 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
 
 
 def process_common_options(options: CommonOptions) -> None:
+    # set environment variables
+    env_args = parse_cli_args(options["env"])
+    init_cli_env(env_args)
+
     # propagate display
     if options["no_ansi"]:
         display = "rich"
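The new --env option threads NAME=value pairs into the process environment before an eval runs. A minimal sketch of the likely flow (the internals of parse_cli_args and init_cli_env are not shown in this diff, so the helper below is hypothetical):

    import os

    def parse_env_pairs(env: tuple[str, ...] | None) -> dict[str, str]:
        """Split repeated --env NAME=value flags into a dict (hypothetical helper)."""
        pairs: dict[str, str] = {}
        for item in env or ():
            name, sep, value = item.partition("=")
            if not sep:
                raise ValueError(f"--env expects NAME=value, got: {item!r}")
            pairs[name.strip()] = value
        return pairs

    # e.g. inspect eval task.py --env A=1 --env B=2 yields {"A": "1", "B": "2"},
    # which init_cli_env presumably applies along these lines:
    os.environ.update(parse_env_pairs(("A=1", "B=2")))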
inspect_ai/_cli/eval.py CHANGED
@@ -56,7 +56,6 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--model",
         type=str,
-        required=True,
         help="Model used to evaluate tasks.",
         envvar="INSPECT_EVAL_MODEL",
     )
@@ -441,7 +440,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
 def eval_command(
     tasks: tuple[str] | None,
     solver: str | None,
-    model: str,
+    model: str | None,
     model_base_url: str | None,
     m: tuple[str] | None,
     model_config: str | None,
@@ -608,7 +607,7 @@ def eval_set_command(
     solver: str | None,
     trace: bool | None,
     approval: str | None,
-    model: str,
+    model: str | None,
     model_base_url: str | None,
     m: tuple[str] | None,
     model_config: str | None,
@@ -671,7 +670,7 @@ def eval_set_command(
 ) -> int:
     """Evaluate a set of tasks with retries.
 
-    Learn more about eval sets at https://inspect.ai-safety-institute.org.uk/eval-sets.html.
+    Learn more about eval sets at https://inspect.aisi.org.uk/eval-sets.html.
     """
     # read config
     config = config_from_locals(dict(locals()))
@@ -741,7 +740,7 @@ def eval_exec(
     log_level_transcript: str,
     log_dir: str,
     log_format: Literal["eval", "json"] | None,
-    model: str,
+    model: str | None,
     model_base_url: str | None,
     m: tuple[str] | None,
     model_config: str | None,
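With --model no longer required, the CLI mirrors the new eval() signature: a model can come from --model, from INSPECT_EVAL_MODEL, or from the task itself. A minimal sketch, under the assumption (suggested by this diff's task.py and PreviousTask changes) that a Task can now carry its own model:

    from inspect_ai import Task, eval, task
    from inspect_ai.dataset import Sample

    @task
    def hello() -> Task:
        return Task(
            dataset=[Sample(input="Say hello.", target="hello")],
            model="openai/gpt-4o-mini",  # assumed per-task model support
        )

    # With the task naming its own model, neither --model nor
    # INSPECT_EVAL_MODEL is required:
    logs = eval(hello())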
inspect_ai/_cli/log.py CHANGED
@@ -30,7 +30,7 @@ def log_command() -> None:
 
     The 'log' commands enable you to read Inspect logs uniformly as JSON no matter their physical storage format, and also enable you to read only the headers (everything but the samples) from log files, which is useful for very large logs.
 
-    Learn more about managing log files at https://inspect.ai-safety-institute.org.uk/eval-logs.html.
+    Learn more about managing log files at https://inspect.aisi.org.uk/eval-logs.html.
     """
     return None
 
inspect_ai/_cli/sandbox.py CHANGED
@@ -9,7 +9,7 @@ from inspect_ai.util._sandbox.registry import registry_find_sandboxenv
 def sandbox_command() -> None:
     """Manage Sandbox Environments.
 
-    Learn more about sandboxing at https://inspect.ai-safety-institute.org.uk/sandboxing.html.
+    Learn more about sandboxing at https://inspect.aisi.org.uk/sandboxing.html.
     """
     return None
 
inspect_ai/_cli/trace.py CHANGED
@@ -28,7 +28,7 @@ def trace_command() -> None:
 
     Inspect includes a TRACE log-level which is right below the HTTP and INFO log levels (so not written to the console by default). However, TRACE logs are always recorded to a separate file, and the last 10 TRACE logs are preserved. The 'trace' command provides ways to list and read these traces.
 
-    Learn more about execution traces at https://inspect.ai-safety-institute.org.uk/tracing.html.
+    Learn more about execution traces at https://inspect.aisi.org.uk/tracing.html.
     """
     return None
 
inspect_ai/_cli/view.py CHANGED
@@ -41,7 +41,7 @@ def start_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
 def view_command(ctx: click.Context, **kwargs: Unpack[CommonOptions]) -> None:
     """Inspect log viewer.
 
-    Learn more about using the log viewer at https://inspect.ai-safety-institute.org.uk/log-viewer.html.
+    Learn more about using the log viewer at https://inspect.aisi.org.uk/log-viewer.html.
     """
     if ctx.invoked_subcommand is None:
         ctx.invoke(start, **kwargs)
inspect_ai/_display/core/config.py CHANGED
@@ -1,4 +1,4 @@
-from inspect_ai._util.registry import is_registry_dict
+from inspect_ai._util.registry import is_model_dict, is_registry_dict
 from inspect_ai.log._log import eval_config_defaults
 
 from .display import TaskProfile
@@ -14,6 +14,8 @@ def task_config(
         value = task_args[key]
         if is_registry_dict(value):
             task_args[key] = value["name"]
+        if is_model_dict(value):
+            task_args[key] = value["model"]
     # get eval_config overrides
     eval_config = dict(profile.eval_config.model_dump(exclude_none=True))
     for name, default_value in eval_config_defaults().items():
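The display-side effect of this change: task args that hold a serialized model are collapsed to their model name before being shown in the task config panel. A minimal sketch, assuming is_model_dict is a simple shape check (its real definition in inspect_ai/_util/registry.py is not shown in this diff):

    from typing import Any

    def is_model_dict(value: Any) -> bool:
        # Hypothetical shape check: a dict carrying a serialized model reference.
        return isinstance(value, dict) and "model" in value

    task_args = {"grader": {"model": "openai/gpt-4o"}, "epochs": 3}
    for key, value in task_args.items():
        if is_model_dict(value):
            task_args[key] = value["model"]  # show "openai/gpt-4o" rather than the dict

    print(task_args)  # {'grader': 'openai/gpt-4o', 'epochs': 3}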
inspect_ai/_eval/eval.py CHANGED
@@ -4,6 +4,8 @@ import sys
 from pathlib import Path
 from typing import Any, Literal
 
+from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
+
 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup
 
@@ -47,16 +49,18 @@ from inspect_ai.util._display import (
 )
 
 from .context import init_eval_context
-from .loader import ResolvedTask, resolve_tasks
+from .loader import resolve_tasks
 from .run import eval_run
-from .task import Epochs, PreviousTask, Tasks
+from .task import Epochs, PreviousTask
+from .task.resolved import ResolvedTask, resolved_model_names
+from .task.tasks import Tasks
 
 log = logging.getLogger(__name__)
 
 
 def eval(
     tasks: Tasks,
-    model: str | Model | list[str] | list[Model] | None = None,
+    model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
     task_args: dict[str, Any] | str = dict(),
@@ -96,9 +100,9 @@ def eval(
     Args:
         tasks: Task(s) to evaluate. If None, attempt
             to evaluate a task in the current working directory
-        model: Model(s) for
-            evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
-            environment variable.
+        model: Model(s) for evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
+            environment variable. Specify `None` to define no default model(s), which will
+            leave model usage entirely up to tasks.
         model_base_url: Base URL for communicating
             with the model API.
         model_args: Model creation args
@@ -144,7 +148,7 @@ def eval(
         max_samples: Maximum number of samples to run in parallel
            (default is max_connections)
        max_tasks: Maximum number of tasks to run in parallel
-            (default is 1)
+            (defaults to number of models being evaluated)
        max_subprocesses: Maximum number of subprocesses to
            run in parallel (default is os.cpu_count())
        max_sandboxes: Maximum number of sandboxes (per-provider)
@@ -223,7 +227,7 @@ _eval_async_running = False
 
 async def eval_async(
     tasks: Tasks,
-    model: str | Model | list[str] | list[Model] | None = None,
+    model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
     task_args: dict[str, Any] | str = dict(),
@@ -259,67 +263,53 @@
     r"""Evaluate tasks using a Model (async).
 
     Args:
-        tasks: (Tasks): Task(s) to evaluate. If None, attempt
+        tasks: Task(s) to evaluate. If None, attempt
            to evaluate a task in the current working directory
-        model (str | Model | list[str] | list[Model] | None): Model(s) for
-            evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
-            environment variable.
-        model_base_url: (str | None): Base URL for communicating
-            with the model API.
-        model_args (dict[str,Any] | str): Model creation args
-            (as a dictionary or as a path to a JSON or YAML config file)
-        task_args (dict[str,Any] | str): Task creation arguments
-            (as a dictionary or as a path to a JSON or YAML config file)
-        sandbox (SandboxEnvironmentType | None): Sandbox environment type
-            (or optionally a str or tuple with a shorthand spec)
-        sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
-            (defaults to True)
-        solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
-            Optional (uses task solver by default).
+        model: Model(s) for evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
+            environment variable. Specify `None` to define no default model(s), which will
+            leave model usage entirely up to tasks.
+        model_base_url: Base URL for communicating with the model API.
+        model_args: Model creation args (as a dictionary or as a path to a JSON or YAML config file)
+        task_args: Task creation arguments (as a dictionary or as a path to a JSON or YAML config file)
+        sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
+        sandbox_cleanup: Cleanup sandbox environments after task completes (defaults to True)
+        solver: Alternative solver for task(s). Optional (uses task solver by default).
         tags (list[str] | None): Tags to associate with this evaluation run.
-        approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
+        approval: Tool use approval policies.
            Either a path to an approval policy config file or a list of approval policies.
            Defaults to no approval policy.
-        log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
+        log_level: Level for logging to the console: "debug", "http", "sandbox",
            "info", "warning", "error", or "critical" (defaults to "warning")
-        log_level_transcript (str | None): Level for logging to the log file (defaults to "info")
-        log_dir (str | None): Output path for logging results
-            (defaults to file log in ./logs directory).
-        log_format (Literal["eval", "json"] | None): Format for writing log files (defaults
-            to "eval", the native high-performance format).
-        limit (str | int | list[str | int] | None): Limit evaluated samples
-            (defaults to all samples).
-        sample_id (str | list[str] | None): Evaluate specific sample(s) from the dataset.
-        epochs (int | Epochs | None): Epochs to repeat samples for and optional score
+        log_level_transcript: Level for logging to the log file (defaults to "info")
+        log_dir: Output path for logging results (defaults to file log in ./logs directory).
+        log_format: Format for writing log files (defaults to "eval", the native high-performance format).
+        limit: Limit evaluated samples (defaults to all samples).
+        sample_id: Evaluate specific sample(s) from the dataset.
+        epochs: Epochs to repeat samples for and optional score
            reducer function(s) used to combine sample scores (defaults to "mean")
-        fail_on_error (bool | float | None): `True` to fail on first sample error
+        fail_on_error: `True` to fail on first sample error
            (default); `False` to never fail on sample errors; Value between 0 and 1
            to fail if a proportion of total samples fails. Value greater than 1 to fail eval if a count of samples fails.
-        debug_errors (bool | None): Raise task errors (rather than logging them)
-            so they can be debugged (defaults to False).
-        message_limit (int | None): Limit on total messages used for each sample.
-        token_limit (int | None): Limit on total tokens used for each sample.
+        debug_errors: Raise task errors (rather than logging them) so they can be debugged (defaults to False).
+        message_limit: Limit on total messages used for each sample.
+        token_limit: Limit on total tokens used for each sample.
         time_limit: Limit on clock time (in seconds) for samples.
         working_limit: Limit on working time (in seconds) for sample. Working
            time includes model generation, tool calls, etc. but does not include
            time spent waiting on retries or shared resources.
-        max_samples (int | None): Maximum number of samples to run in parallel
-            (default is max_connections)
-        max_tasks (int | None): Maximum number of tasks to run in parallel
-            (default is 1)
-        max_subprocesses (int | None): Maximum number of subprocesses to
-            run in parallel (default is os.cpu_count())
-        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
-            to run in parallel.
-        log_samples: (bool | None): Log detailed samples and scores (defaults to True)
-        log_images: (bool | None): Log base64 encoded version of images,
-            even if specified as a filename or URL (defaults to False)
-        log_buffer: (int | None): Number of samples to buffer before writing log file.
+        max_samples: Maximum number of samples to run in parallel (default is max_connections)
+        max_tasks: Maximum number of tasks to run in parallel
+            (defaults to number of models being evaluated)
+        max_subprocesses: Maximum number of subprocesses to run in parallel (default is os.cpu_count())
+        max_sandboxes: Maximum number of sandboxes (per-provider) to run in parallel.
+        log_samples: Log detailed samples and scores (defaults to True)
+        log_images: Log base64 encoded version of images, even if specified as a filename or URL (defaults to False)
+        log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
-        score (bool): Score output (defaults to True)
-        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
-        **kwargs (GenerateConfigArgs): Model generation options.
+        score: Score output (defaults to True)
+        score_display: Show scoring metrics in realtime (defaults to True)
+        **kwargs: Model generation options.
 
     Returns:
         List of EvalLog (one for each task)
@@ -365,6 +355,12 @@ async def eval_async(
         log.warning("No inspect tasks were found at the specified paths.")
         return []
 
+    # if there is no max tasks then base it on unique model names
+    if max_tasks is None:
+        model_count = len(resolved_model_names(resolved_tasks))
+        if model_count > 1:
+            max_tasks = model_count
+
     # apply conversation display constraints
     if display_type() == "conversation":
         # single task at a time
@@ -450,7 +446,6 @@ async def eval_async(
             eval_config=eval_config,
             eval_sandbox=sandbox,
             recorder=recorder,
-            model_args=model_args,
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
@@ -475,7 +470,6 @@ async def eval_async(
             eval_config=eval_config,
             eval_sandbox=sandbox,
             recorder=recorder,
-            model_args=model_args,
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
@@ -529,7 +523,7 @@ def eval_retry(
         max_samples: Maximum number of samples to run in parallel
            (default is max_connections)
        max_tasks: Maximum number of tasks to run in parallel
-            (default is 1)
+            (defaults to number of models being evaluated)
        max_subprocesses: Maximum number of subprocesses to
            run in parallel (default is os.cpu_count())
        max_sandboxes: Maximum number of sandboxes (per-provider)
@@ -764,7 +758,7 @@ async def eval_retry_async(
     log = (
         await eval_async(
             tasks=PreviousTask(
-                id=task_id, task=task, task_args=task_args, log=eval_log
+                id=task_id, task=task, task_args=task_args, model=None, log=eval_log
             ),
             model=model,
             model_base_url=model_base_url,
@@ -809,7 +803,7 @@
 
 def eval_init(
     tasks: Tasks,
-    model: str | Model | list[str] | list[Model] | None = None,
+    model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
     task_args: dict[str, Any] | str = dict(),
@@ -886,7 +880,7 @@ def init_eval_display(
     # multiple models not allowed in trace mode
     if isinstance(model, list) and len(model) > 1:
         raise PrerequisiteError(
-            "Trace mode cannot be used when evaluating multiple models."
+            "Conversation mode cannot be used when evaluating multiple models."
        )
 
     return max_tasks, max_samples
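The NOT_GIVEN change above distinguishes "argument omitted" from an explicit model=None. A minimal sketch of the sentinel pattern this relies on (the resolution logic shown is illustrative, not the package's actual implementation):

    import os

    class NotGiven:
        """Sentinel type distinguishing 'omitted' from an explicit None."""
        def __bool__(self) -> bool:
            return False
        def __repr__(self) -> str:
            return "NOT_GIVEN"

    NOT_GIVEN = NotGiven()

    def resolve_model(model: str | None | NotGiven = NOT_GIVEN) -> str | None:
        if isinstance(model, NotGiven):
            # omitted: fall back to the INSPECT_EVAL_MODEL environment variable
            return os.environ.get("INSPECT_EVAL_MODEL")
        # explicit None: no default model; tasks must specify their own
        return model

    # eval(task)              -> uses INSPECT_EVAL_MODEL if set
    # eval(task, model=None)  -> no default model; model usage left to tasks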