inspect-ai 0.3.58__py3-none-any.whl → 0.3.60__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. inspect_ai/_cli/common.py +3 -1
  2. inspect_ai/_cli/eval.py +15 -9
  3. inspect_ai/_display/core/active.py +4 -1
  4. inspect_ai/_display/core/config.py +3 -3
  5. inspect_ai/_display/core/panel.py +7 -3
  6. inspect_ai/_display/plain/__init__.py +0 -0
  7. inspect_ai/_display/plain/display.py +203 -0
  8. inspect_ai/_display/rich/display.py +0 -5
  9. inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
  10. inspect_ai/_display/textual/widgets/samples.py +79 -12
  11. inspect_ai/_display/textual/widgets/sandbox.py +37 -0
  12. inspect_ai/_eval/eval.py +10 -1
  13. inspect_ai/_eval/loader.py +79 -19
  14. inspect_ai/_eval/registry.py +6 -0
  15. inspect_ai/_eval/score.py +3 -1
  16. inspect_ai/_eval/task/results.py +51 -22
  17. inspect_ai/_eval/task/run.py +47 -13
  18. inspect_ai/_eval/task/sandbox.py +10 -5
  19. inspect_ai/_util/constants.py +1 -0
  20. inspect_ai/_util/port_names.py +61 -0
  21. inspect_ai/_util/text.py +23 -0
  22. inspect_ai/_view/www/App.css +31 -1
  23. inspect_ai/_view/www/dist/assets/index.css +31 -1
  24. inspect_ai/_view/www/dist/assets/index.js +25498 -2044
  25. inspect_ai/_view/www/log-schema.json +32 -2
  26. inspect_ai/_view/www/package.json +2 -0
  27. inspect_ai/_view/www/src/App.mjs +14 -16
  28. inspect_ai/_view/www/src/Types.mjs +1 -2
  29. inspect_ai/_view/www/src/api/Types.ts +133 -0
  30. inspect_ai/_view/www/src/api/{api-browser.mjs → api-browser.ts} +25 -13
  31. inspect_ai/_view/www/src/api/api-http.ts +219 -0
  32. inspect_ai/_view/www/src/api/api-shared.ts +47 -0
  33. inspect_ai/_view/www/src/api/{api-vscode.mjs → api-vscode.ts} +22 -19
  34. inspect_ai/_view/www/src/api/{client-api.mjs → client-api.ts} +93 -53
  35. inspect_ai/_view/www/src/api/index.ts +51 -0
  36. inspect_ai/_view/www/src/api/jsonrpc.ts +225 -0
  37. inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
  38. inspect_ai/_view/www/src/components/DownloadButton.mjs +1 -1
  39. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
  40. inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
  41. inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
  42. inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
  43. inspect_ai/_view/www/src/index.js +77 -4
  44. inspect_ai/_view/www/src/log/{remoteLogFile.mjs → remoteLogFile.ts} +62 -46
  45. inspect_ai/_view/www/src/navbar/Navbar.mjs +4 -1
  46. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +19 -10
  47. inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
  48. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
  49. inspect_ai/_view/www/src/samples/SampleList.mjs +19 -49
  50. inspect_ai/_view/www/src/samples/SampleScores.mjs +1 -1
  51. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
  52. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +38 -26
  53. inspect_ai/_view/www/src/samples/SamplesTab.mjs +14 -11
  54. inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
  55. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
  56. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +2 -2
  57. inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
  58. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
  59. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
  60. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
  61. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
  62. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
  63. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
  64. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
  65. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
  66. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
  67. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
  68. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
  69. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
  70. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
  71. inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
  72. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
  73. inspect_ai/_view/www/src/types/log.d.ts +13 -2
  74. inspect_ai/_view/www/src/utils/Format.mjs +10 -3
  75. inspect_ai/_view/www/src/utils/{Json.mjs → json-worker.ts} +13 -9
  76. inspect_ai/_view/www/src/utils/vscode.ts +36 -0
  77. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +11 -5
  78. inspect_ai/_view/www/vite.config.js +7 -0
  79. inspect_ai/_view/www/yarn.lock +116 -0
  80. inspect_ai/approval/_human/__init__.py +0 -0
  81. inspect_ai/approval/_human/manager.py +1 -1
  82. inspect_ai/approval/_policy.py +12 -6
  83. inspect_ai/log/_log.py +1 -1
  84. inspect_ai/log/_samples.py +16 -0
  85. inspect_ai/log/_transcript.py +4 -1
  86. inspect_ai/model/_call_tools.py +59 -0
  87. inspect_ai/model/_conversation.py +16 -7
  88. inspect_ai/model/_generate_config.py +12 -12
  89. inspect_ai/model/_model.py +117 -18
  90. inspect_ai/model/_model_output.py +22 -2
  91. inspect_ai/model/_openai.py +383 -0
  92. inspect_ai/model/_providers/anthropic.py +152 -55
  93. inspect_ai/model/_providers/azureai.py +21 -21
  94. inspect_ai/model/_providers/bedrock.py +37 -40
  95. inspect_ai/model/_providers/goodfire.py +248 -0
  96. inspect_ai/model/_providers/google.py +46 -54
  97. inspect_ai/model/_providers/groq.py +7 -3
  98. inspect_ai/model/_providers/hf.py +6 -0
  99. inspect_ai/model/_providers/mistral.py +13 -12
  100. inspect_ai/model/_providers/openai.py +51 -218
  101. inspect_ai/model/_providers/openai_o1.py +11 -12
  102. inspect_ai/model/_providers/providers.py +23 -1
  103. inspect_ai/model/_providers/together.py +12 -12
  104. inspect_ai/model/_providers/util/__init__.py +2 -3
  105. inspect_ai/model/_providers/util/hf_handler.py +1 -1
  106. inspect_ai/model/_providers/util/llama31.py +1 -1
  107. inspect_ai/model/_providers/util/util.py +0 -76
  108. inspect_ai/model/_providers/vertex.py +1 -4
  109. inspect_ai/scorer/_metric.py +3 -0
  110. inspect_ai/scorer/_reducer/reducer.py +1 -1
  111. inspect_ai/scorer/_scorer.py +4 -3
  112. inspect_ai/solver/__init__.py +4 -5
  113. inspect_ai/solver/_basic_agent.py +1 -1
  114. inspect_ai/solver/_bridge/__init__.py +3 -0
  115. inspect_ai/solver/_bridge/bridge.py +100 -0
  116. inspect_ai/solver/_bridge/patch.py +170 -0
  117. inspect_ai/solver/_prompt.py +35 -5
  118. inspect_ai/solver/_solver.py +6 -0
  119. inspect_ai/solver/_task_state.py +80 -38
  120. inspect_ai/tool/__init__.py +2 -0
  121. inspect_ai/tool/_tool.py +12 -1
  122. inspect_ai/tool/_tool_call.py +10 -0
  123. inspect_ai/tool/_tool_def.py +16 -5
  124. inspect_ai/tool/_tool_with.py +21 -4
  125. inspect_ai/tool/beta/__init__.py +5 -0
  126. inspect_ai/tool/beta/_computer/__init__.py +3 -0
  127. inspect_ai/tool/beta/_computer/_common.py +133 -0
  128. inspect_ai/tool/beta/_computer/_computer.py +155 -0
  129. inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
  130. inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
  131. inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
  132. inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
  133. inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
  134. inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
  135. inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
  136. inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
  137. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
  138. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
  139. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
  140. inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
  141. inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
  142. inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
  143. inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
  144. inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
  145. inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
  146. inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
  147. inspect_ai/util/__init__.py +2 -0
  148. inspect_ai/util/_display.py +5 -0
  149. inspect_ai/util/_limit.py +26 -0
  150. inspect_ai/util/_sandbox/docker/docker.py +64 -1
  151. inspect_ai/util/_sandbox/docker/internal.py +3 -1
  152. inspect_ai/util/_sandbox/docker/prereqs.py +1 -1
  153. inspect_ai/util/_sandbox/environment.py +14 -0
  154. {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/METADATA +3 -2
  155. {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/RECORD +159 -126
  156. inspect_ai/_view/www/src/api/Types.mjs +0 -117
  157. inspect_ai/_view/www/src/api/api-http.mjs +0 -300
  158. inspect_ai/_view/www/src/api/api-shared.mjs +0 -10
  159. inspect_ai/_view/www/src/api/index.mjs +0 -49
  160. inspect_ai/_view/www/src/api/jsonrpc.mjs +0 -208
  161. inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
  162. inspect_ai/_view/www/src/utils/vscode.mjs +0 -16
  163. {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/LICENSE +0 -0
  164. {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/WHEEL +0 -0
  165. {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/entry_points.txt +0 -0
  166. {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,37 @@
1
+ from textual.app import ComposeResult
2
+ from textual.containers import Horizontal, Vertical
3
+ from textual.widgets import Static
4
+
5
+ from inspect_ai.util._sandbox.environment import SandboxConnection
6
+
7
+ from .port_mappings import PortMappingsView
8
+
9
+
10
class SandboxView(Vertical):
    """Vertical container that renders the details of one sandbox connection.

    The view shows an optional header with the sandbox name, followed by a
    row containing the connection command and, when the connection exposes
    ports, a ``PortMappingsView`` for them.
    """

    DEFAULT_CSS = """
    .indent {
        width: 2;
    }
    .no_indent {
        width: 0;
    }
    """

    def __init__(
        self,
        connection: SandboxConnection,
        name: str | None,  # if None, no header or indent
    ) -> None:
        """Store the connection to display and the (optional) sandbox name.

        Args:
            connection: Connection whose ``command`` and ``ports`` are shown.
            name: Header text; when ``None`` no header is emitted and the
                spacer column gets the zero-width ``no_indent`` class.
        """
        super().__init__()
        self.connection = connection
        self.sandbox_name = name

    def compose(self) -> ComposeResult:
        """Yield the child widgets that make up the view."""
        has_header = bool(self.sandbox_name)

        # header line is only present for named sandboxes
        if has_header:
            yield Static(self.sandbox_name)

        # an empty Static acts as a spacer column: 2 cells wide under a
        # header, collapsed to 0 cells otherwise (see DEFAULT_CSS)
        spacer_class = "indent" if has_header else "no_indent"
        with Horizontal():
            yield Static("", classes=spacer_class)
            with Vertical():
                yield Static(self.connection.command)
                port_mappings = self.connection.ports
                if port_mappings:
                    yield PortMappingsView(port_mappings)
inspect_ai/_eval/eval.py CHANGED
@@ -35,7 +35,12 @@ from inspect_ai.scorer._reducer import reducer_log_names
35
35
  from inspect_ai.solver._chain import chain
36
36
  from inspect_ai.solver._solver import Solver, SolverSpec
37
37
  from inspect_ai.util import SandboxEnvironmentType
38
- from inspect_ai.util._display import DisplayType, display_type, init_display_type
38
+ from inspect_ai.util._display import (
39
+ DisplayType,
40
+ display_type,
41
+ display_type_initialized,
42
+ init_display_type,
43
+ )
39
44
 
40
45
  from .context import init_eval_context
41
46
  from .loader import ResolvedTask, resolve_tasks
@@ -306,6 +311,10 @@ async def eval_async(
306
311
 
307
312
  _eval_async_running = True
308
313
 
314
+ # if we are called outside of eval() then set display type to "plain"
315
+ if not display_type_initialized():
316
+ init_display_type("plain")
317
+
309
318
  # resolve model and task args
310
319
  model_args = resolve_args(model_args)
311
320
  task_args = resolve_args(task_args)
@@ -1,5 +1,6 @@
1
1
  import ast
2
2
  import contextlib
3
+ import inspect
3
4
  import os
4
5
  from dataclasses import dataclass, field
5
6
  from importlib.machinery import SourceFileLoader
@@ -9,11 +10,13 @@ from pathlib import Path
9
10
  from types import ModuleType
10
11
  from typing import Any, Callable, cast
11
12
 
13
+ from typing_extensions import overload
14
+
12
15
  from inspect_ai._eval.task.util import task_file, task_run_dir
13
16
  from inspect_ai._util.decorator import parse_decorators
14
17
  from inspect_ai._util.error import PrerequisiteError
15
18
  from inspect_ai._util.logger import warn_once
16
- from inspect_ai._util.path import chdir_python
19
+ from inspect_ai._util.path import chdir_python, cwd_relative_path
17
20
  from inspect_ai._util.registry import (
18
21
  RegistryInfo,
19
22
  is_registry_object,
@@ -23,6 +26,7 @@ from inspect_ai._util.registry import (
23
26
  registry_params,
24
27
  )
25
28
  from inspect_ai.model import Model, ModelName
29
+ from inspect_ai.solver._bridge import bridge
26
30
  from inspect_ai.solver._solver import Solver, SolverSpec
27
31
  from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
28
32
  from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
@@ -334,6 +338,16 @@ def split_spec(spec: str) -> tuple[str, str | None]:
334
338
  return spec, None
335
339
 
336
340
 
341
+ @overload
342
+ def load_module(
343
+ module_path: Path, filter: Callable[[str], bool]
344
+ ) -> ModuleType | None: ...
345
+
346
+
347
+ @overload
348
+ def load_module(module_path: Path, filter: None = None) -> ModuleType: ...
349
+
350
+
337
351
  def load_module(
338
352
  module_path: Path, filter: Callable[[str], bool] | None = None
339
353
  ) -> ModuleType | None:
@@ -425,28 +439,74 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
425
439
  else contextlib.nullcontext()
426
440
  )
427
441
 
442
+ # pretty solver name for error messages
443
+ pretty_solver_file = (
444
+ cwd_relative_path(solver_file.as_posix()) if solver_file else None
445
+ )
446
+
428
447
  with create_cm:
429
- # if we have a file then we need to load it and (if required) determine the solver name
430
- if solver_file is not None:
431
- # load the module so that registry_create works
432
- load_module(solver_file)
448
+ # if there is no solver file then just create from the registry by name
449
+ if solver_file is None:
450
+ if solver_name is None:
451
+ raise ValueError(f"Unable to resolve solver name from {spec.solver}")
452
+ return cast(Solver, registry_create("solver", solver_name, **spec.args))
433
453
 
434
- # if there is no solver_name we need to discover the first @solver
454
+ # we do have a solver file
455
+ else:
456
+ # load the module and parse decorators
457
+ solver_module = load_module(solver_file)
458
+ decorators = parse_decorators(solver_file, "solver")
459
+
460
+ # if there is no solver_name see if we can discover it
435
461
  if solver_name is None:
436
- solvers = parse_decorators(solver_file, "solver")
437
- if len(solvers) == 0:
462
+ if len(decorators) == 1:
463
+ # decorator based solver
464
+ solver_name = decorators[0][0]
465
+ elif len(decorators) == 0:
466
+ # see if we can find an agent based solver
467
+ functions = [
468
+ function
469
+ for function in inspect.getmembers(
470
+ solver_module, inspect.isfunction
471
+ )
472
+ if function[1].__module__ == solver_module.__name__
473
+ ]
474
+ agent_functions = [
475
+ function
476
+ for function in functions
477
+ if "agent" in function[0] and not function[0].startswith("_")
478
+ ]
479
+ if len(agent_functions) == 1:
480
+ # agent based solver
481
+ solver_name = agent_functions[0][0]
482
+
483
+ elif len(agent_functions) == 0:
484
+ raise PrerequisiteError(
485
+ f"The source file {pretty_solver_file} does not contain any @solver functions or agent functions."
486
+ )
487
+ else:
488
+ raise PrerequisiteError(
489
+ f"The source file {pretty_solver_file} has more than one agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
490
+ )
491
+ else:
438
492
  raise PrerequisiteError(
439
- f"The source file {solver_file.as_posix()} does not contain any @solver functions."
493
+ f"The source file {pretty_solver_file} has more than one @solver function (qualify which solver using e.g. '{solver_file.name}y@solver_fn')"
440
494
  )
441
- if len(solvers) > 1:
442
- raise PrerequisiteError(
443
- f"The source file {solver_file.as_posix()} has more than one @solver function (qualify which solver using file.py@solver)"
444
- )
445
- solver_name = solvers[0][0]
446
495
 
447
- # make mypy happy and catch unexpected branching
448
- if solver_name is None:
449
- raise ValueError(f"Unable to resolve solver name from {spec.solver}")
496
+ # create decorator based solvers using the registry
497
+ if any(solver[0] == solver_name for solver in decorators):
498
+ return cast(Solver, registry_create("solver", solver_name, **spec.args))
450
499
 
451
- solver = cast(Solver, registry_create("solver", solver_name, **spec.args))
452
- return solver
500
+ # create agent based solvers by calling the function and wrapping it in bridge()
501
+ else:
502
+ agent_fn = getattr(solver_module, solver_name, None)
503
+ if inspect.isfunction(agent_fn):
504
+ return bridge(agent_fn(**spec.args))
505
+ elif agent_fn is not None:
506
+ raise PrerequisiteError(
507
+ f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
508
+ )
509
+ else:
510
+ raise PrerequisiteError(
511
+ f"The function {solver_name} was not found in file {pretty_solver_file}."
512
+ )
@@ -1,6 +1,7 @@
1
1
  import inspect
2
2
  import logging
3
3
  from copy import deepcopy
4
+ from functools import wraps
4
5
  from pathlib import Path
5
6
  from typing import Any, Callable, TypeVar, cast, overload
6
7
 
@@ -125,6 +126,7 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
125
126
  params = list(inspect.signature(task_type).parameters.keys())
126
127
 
127
128
  # Create and return the wrapper function
129
+ @wraps(task_type)
128
130
  def wrapper(*w_args: Any, **w_kwargs: Any) -> Task:
129
131
  # Create the task
130
132
  task_instance = task_type(*w_args, **w_kwargs)
@@ -154,6 +156,10 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
154
156
  # Return the task instance
155
157
  return task_instance
156
158
 
159
+ # functools.wraps overrides the return type annotation of the inner function, so
160
+ # we explicitly set it again
161
+ wrapper.__annotations__["return"] = Task
162
+
157
163
  # Register the task and return the wrapper
158
164
  return task_register(
159
165
  task=cast(TaskType, wrapper), name=task_name, attribs=attribs, params=params
inspect_ai/_eval/score.py CHANGED
@@ -5,7 +5,7 @@ from typing import Callable, cast
5
5
  from inspect_ai._display import display
6
6
  from inspect_ai._util.path import chdir_python
7
7
  from inspect_ai._util.platform import platform_init
8
- from inspect_ai._util.registry import registry_create
8
+ from inspect_ai._util.registry import registry_create, registry_unqualified_name
9
9
  from inspect_ai.log import (
10
10
  EvalLog,
11
11
  EvalMetric,
@@ -85,6 +85,7 @@ async def score_async(
85
85
  sample_id=sample.id,
86
86
  epoch=sample.epoch,
87
87
  input=sample.input,
88
+ target=Target(sample.target),
88
89
  choices=sample.choices,
89
90
  messages=sample.messages,
90
91
  output=sample.output,
@@ -184,6 +185,7 @@ async def run_score_task(
184
185
  results[scorer_name] = SampleScore(
185
186
  score=result,
186
187
  sample_id=state.sample_id,
188
+ scorer=registry_unqualified_name(scorer),
187
189
  )
188
190
 
189
191
  progress()
@@ -2,6 +2,7 @@ import fnmatch
2
2
  import re
3
3
  from collections import defaultdict
4
4
  from copy import deepcopy
5
+ from dataclasses import dataclass, field
5
6
  from typing import Any, Tuple, cast
6
7
 
7
8
  from inspect_ai._util.registry import (
@@ -19,6 +20,8 @@ from inspect_ai.log import (
19
20
  from inspect_ai.log._log import EvalSampleReductions
20
21
  from inspect_ai.scorer import Metric, Score, Scorer
21
22
  from inspect_ai.scorer._metric import SampleScore
23
+ from inspect_ai.scorer._metrics.accuracy import accuracy
24
+ from inspect_ai.scorer._metrics.std import stderr
22
25
  from inspect_ai.scorer._reducer import ScoreReducer, mean_score, reducer_log_name
23
26
  from inspect_ai.scorer._scorer import (
24
27
  SCORER_METRICS,
@@ -27,6 +30,27 @@ from inspect_ai.scorer._scorer import (
27
30
  )
28
31
 
29
32
 
33
@dataclass
class ScorerInfo:
    """Name, metrics, params and metadata describing a scorer.

    Instances are built either from a registered scorer object
    (``from_scorer``) or from a bare score name (``from_name``).
    """

    # name used to identify the scorer
    name: str
    # metrics to compute: a list (possibly mixing plain metrics with
    # dictionaries of metrics) or a single dictionary of metric lists
    metrics: list[Metric | dict[str, list[Metric]]] | dict[str, list[Metric]]
    # parameters the scorer was created with (empty when unknown)
    params: dict[str, Any] = field(default_factory=dict)
    # additional metadata about the scorer (empty when unknown)
    metadata: dict[str, Any] = field(default_factory=dict)

    @staticmethod
    def from_scorer(scorer: Scorer) -> "ScorerInfo":
        """Build a ``ScorerInfo`` from a registered scorer.

        The registry metadata is deep-copied before the ``SCORER_METRICS``
        entry is removed, so the registry's own metadata is left untouched.
        """
        scorer_name = registry_unqualified_name(scorer)
        scorer_metric_list = scorer_metrics(scorer)

        # work on a copy and drop the metrics entry (metrics are carried
        # separately in the ``metrics`` field)
        extra_metadata = deepcopy(registry_info(scorer).metadata)
        extra_metadata.pop(SCORER_METRICS)

        scorer_params = registry_params(scorer)
        return ScorerInfo(
            name=scorer_name,
            metrics=scorer_metric_list,
            params=scorer_params,
            metadata=extra_metadata,
        )

    @staticmethod
    def from_name(name: str) -> "ScorerInfo":
        """Build a ``ScorerInfo`` for a bare score name.

        Uses ``accuracy()`` and ``stderr()`` as the metrics and leaves
        params and metadata empty.
        """
        default_metrics = [accuracy(), stderr()]
        return ScorerInfo(name=name, metrics=default_metrics)
52
+
53
+
30
54
  def eval_results(
31
55
  samples: int,
32
56
  scores: list[dict[str, SampleScore]],
@@ -38,18 +62,24 @@ def eval_results(
38
62
  results = EvalResults(total_samples=samples, completed_samples=len(scores))
39
63
  reductions = None
40
64
 
65
+ # extract scorers info from scorers then create scorers info for any
66
+ # scores not already accounted for by a scorer name
67
+ scorers_info = [ScorerInfo.from_scorer(scorer) for scorer in (scorers or [])]
68
+ scorer_names = {info.name for info in scorers_info}
69
+ for sample_scores in scores:
70
+ for name, sample_score in sample_scores.items():
71
+ if sample_score.scorer is None and name not in scorer_names:
72
+ scorers_info.append(ScorerInfo.from_name(name))
73
+ scorer_names.add(name)
74
+
41
75
  # record scorer
42
- if scorers:
76
+ if len(scorers_info) > 0:
43
77
  result_scores: list[EvalScore] = []
44
78
  sample_reductions: list[EvalSampleReductions] = []
45
- for scorer in scorers:
46
- # extract non-metrics metadata
47
- metadata = deepcopy(registry_info(scorer).metadata)
48
- del metadata[SCORER_METRICS]
49
-
79
+ for scorer_info in scorers_info:
50
80
  # this scorer
51
81
  scorer_name = unique_scorer_name(
52
- scorer, [eval_score.name for eval_score in result_scores]
82
+ scorer_info.name, [eval_score.name for eval_score in result_scores]
53
83
  )
54
84
 
55
85
  # scores for this scorer
@@ -75,7 +105,7 @@ def eval_results(
75
105
 
76
106
  # Compute metrics for this scorer
77
107
  simple_scores = cast(list[Score], reduced_scores)
78
- targets = metrics if metrics is not None else scorer_metrics(scorer)
108
+ targets = metrics if metrics is not None else scorer_info.metrics
79
109
  if isinstance(targets, list):
80
110
  ## split the metrics into the simple metrics and any dictionary
81
111
  ## metrics, to be processed independently
@@ -88,8 +118,7 @@ def eval_results(
88
118
  result_scores.extend(
89
119
  scorer_for_metrics(
90
120
  scorer_name=scorer_name,
91
- scorer=scorer,
92
- metadata=metadata,
121
+ scorer_info=scorer_info,
93
122
  scores=simple_scores,
94
123
  metrics=simple_metrics,
95
124
  reducer_name=reducer_display_nm,
@@ -99,8 +128,7 @@ def eval_results(
99
128
  result_scores.extend(
100
129
  scorers_from_metric_dict(
101
130
  scorer_name=scorer_name,
102
- scorer=scorer,
103
- metadata=metadata,
131
+ scorer_info=scorer_info,
104
132
  scores=simple_scores,
105
133
  metrics=dict_metric,
106
134
  reducer_name=reducer_display_nm,
@@ -116,8 +144,7 @@ def eval_results(
116
144
  result_scores.extend(
117
145
  scorers_from_metric_dict(
118
146
  scorer_name=scorer_name,
119
- scorer=scorer,
120
- metadata=metadata,
147
+ scorer_info=scorer_info,
121
148
  scores=simple_scores,
122
149
  metrics=targets,
123
150
  reducer_name=reducer_display_nm,
@@ -156,8 +183,7 @@ def split_metrics(
156
183
 
157
184
  def scorer_for_metrics(
158
185
  scorer_name: str,
159
- scorer: Scorer,
160
- metadata: dict[str, Any],
186
+ scorer_info: ScorerInfo,
161
187
  scores: list[Score],
162
188
  metrics: list[Metric],
163
189
  reducer_name: str | None = None,
@@ -218,8 +244,10 @@ def scorer_for_metrics(
218
244
  scorer=scorer_name,
219
245
  reducer=reducer_name,
220
246
  name=scorer_name,
221
- params=registry_params(scorer),
222
- metadata=metadata if len(metadata.keys()) > 0 else None,
247
+ params=scorer_info.params,
248
+ metadata=scorer_info.metadata
249
+ if len(scorer_info.metadata.keys()) > 0
250
+ else None,
223
251
  metrics=list_metrics,
224
252
  )
225
253
  )
@@ -228,8 +256,7 @@ def scorer_for_metrics(
228
256
 
229
257
  def scorers_from_metric_dict(
230
258
  scorer_name: str,
231
- scorer: Scorer,
232
- metadata: dict[str, Any],
259
+ scorer_info: ScorerInfo,
233
260
  scores: list[Score],
234
261
  metrics: dict[str, list[Metric]],
235
262
  reducer_name: str | None = None,
@@ -299,8 +326,10 @@ def scorers_from_metric_dict(
299
326
  scorer=scorer_name,
300
327
  reducer=reducer_name,
301
328
  name=metric_key,
302
- params=registry_params(scorer),
303
- metadata=metadata if len(metadata.keys()) > 0 else None,
329
+ params=scorer_info.params,
330
+ metadata=scorer_info.metadata
331
+ if len(scorer_info.metadata.keys()) > 0
332
+ else None,
304
333
  metrics=result_metrics,
305
334
  )
306
335
  )
@@ -30,8 +30,9 @@ from inspect_ai._util.hooks import send_telemetry
30
30
  from inspect_ai._util.registry import (
31
31
  is_registry_object,
32
32
  registry_log_name,
33
+ registry_unqualified_name,
33
34
  )
34
- from inspect_ai._util.timeouts import Timeout, timeout, timeout_at
35
+ from inspect_ai._util.timeouts import Timeout, timeout
35
36
  from inspect_ai._view.notify import view_notify_eval
36
37
  from inspect_ai.dataset import Dataset, Sample
37
38
  from inspect_ai.log import (
@@ -45,7 +46,11 @@ from inspect_ai.log import (
45
46
  from inspect_ai.log._condense import condense_sample
46
47
  from inspect_ai.log._file import eval_log_json_str
47
48
  from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
48
- from inspect_ai.log._samples import active_sample
49
+ from inspect_ai.log._samples import (
50
+ active_sample,
51
+ set_active_sample_message_limit,
52
+ set_active_sample_token_limit,
53
+ )
49
54
  from inspect_ai.log._transcript import (
50
55
  ErrorEvent,
51
56
  SampleInitEvent,
@@ -72,6 +77,7 @@ from inspect_ai.solver._chain import Chain, unroll
72
77
  from inspect_ai.solver._fork import set_task_generate
73
78
  from inspect_ai.solver._solver import Solver
74
79
  from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
80
+ from inspect_ai.util._limit import SampleLimitExceededError
75
81
  from inspect_ai.util._sandbox.context import sandbox_connections
76
82
  from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
77
83
  from inspect_ai.util._subtask import init_subtask
@@ -538,6 +544,9 @@ async def task_run_sample(
538
544
  # helper to handle exceptions (will throw if we've exceeded the limit)
539
545
  def handle_error(ex: BaseException) -> EvalError:
540
546
  err = sample_error(ex)
547
+ py_logger.warning(
548
+ f"Sample error (id: {sample.id}, epoch: {state.epoch}): {exception_message(ex)})"
549
+ )
541
550
  transcript()._event(ErrorEvent(error=err))
542
551
  return err
543
552
 
@@ -630,30 +639,43 @@ async def task_run_sample(
630
639
  else:
631
640
  raise
632
641
 
642
+ except SampleLimitExceededError as ex:
643
+ # sample limit event
644
+ transcript()._event(
645
+ SampleLimitEvent(
646
+ type=ex.type,
647
+ limit=ex.limit,
648
+ message=f"Sample completed: {ex.message}",
649
+ )
650
+ )
651
+
652
+ # capture most recent state for scoring
653
+ state = sample_state() or state
654
+ state.completed = True
655
+
633
656
  except BaseException as ex:
634
657
  error = handle_error(ex)
635
658
 
636
- # set timeout for scoring. if the original timeout was never hit
637
- # then just create a new timeout_cm targeting the original
638
- # timeout time. if the original timeout was hit we still want
639
- # to provide an opportunity for scoring, but we don't necessarily
659
+ # set timeout for scoring. if the original timeout was hit we still
660
+ # want to provide opportunity for scoring, but we don't necessarily
640
661
  # want to wait the full timeout again (especially in the case where
641
662
  # the cause of the timeout is a hung container and scoring requires
642
663
  # interacting with the container). as a middle ground we use half
643
664
  # of the original timeout value for scoring.
644
665
  if isinstance(timeout_cm, Timeout):
645
- if not timeout_cm.expired():
646
- timeout_cm = timeout_at(timeout_cm.when())
647
- else:
648
- assert time_limit
649
- timeout_cm = timeout(time_limit / 2)
666
+ assert time_limit
667
+ timeout_cm = timeout(time_limit / 2)
668
+
669
+ # turn off sample limits
670
+ set_active_sample_token_limit(None)
671
+ set_active_sample_message_limit(None)
650
672
 
651
673
  # scoring
652
674
  try:
653
675
  # timeout during scoring will result in an ordinary sample error
654
676
  async with timeout_cm:
655
- if scorers and error is None:
656
- for scorer in scorers:
677
+ if error is None:
678
+ for scorer in scorers or []:
657
679
  scorer_name = unique_scorer_name(
658
680
  scorer, list(results.keys())
659
681
  )
@@ -667,6 +689,7 @@ async def task_run_sample(
667
689
  sample_score = SampleScore(
668
690
  score=score_result,
669
691
  sample_id=sample.id,
692
+ scorer=registry_unqualified_name(scorer),
670
693
  )
671
694
  transcript()._event(
672
695
  ScoreEvent(
@@ -675,6 +698,16 @@ async def task_run_sample(
675
698
  )
676
699
  results[scorer_name] = sample_score
677
700
 
701
+ # add scores returned by solvers
702
+ if state.scores is not None:
703
+ for name, score in state.scores.items():
704
+ results[name] = SampleScore(
705
+ score=score, sample_id=state.sample_id
706
+ )
707
+
708
+ # propagate results into scores
709
+ state.scores = {k: v.score for k, v in results.items()}
710
+
678
711
  except asyncio.CancelledError:
679
712
  if active.interrupt_action:
680
713
  transcript()._event(
@@ -819,6 +852,7 @@ async def resolve_dataset(
819
852
  epoch=epoch,
820
853
  model=model_name,
821
854
  input=sample.input,
855
+ target=Target(sample.target),
822
856
  choices=sample.choices,
823
857
  messages=sample_messages(sample),
824
858
  message_limit=message_limit,
@@ -4,11 +4,13 @@ import contextlib
4
4
  from random import random
5
5
  from typing import AsyncGenerator, Callable, NamedTuple, cast
6
6
 
7
+ import httpx
8
+
7
9
  from inspect_ai._eval.task.task import Task
8
10
  from inspect_ai._eval.task.util import task_run_dir
9
11
  from inspect_ai._util.file import file, filesystem
10
12
  from inspect_ai._util.registry import registry_unqualified_name
11
- from inspect_ai._util.url import data_uri_to_base64, is_data_uri
13
+ from inspect_ai._util.url import data_uri_to_base64, is_data_uri, is_http_url
12
14
  from inspect_ai.dataset import Sample
13
15
  from inspect_ai.util._concurrency import concurrency
14
16
  from inspect_ai.util._sandbox.context import (
@@ -65,12 +67,12 @@ async def sandboxenv_context(
65
67
  files: dict[str, bytes] = {}
66
68
  if sample.files:
67
69
  for path, contents in sample.files.items():
68
- files[path] = read_sandboxenv_file(contents)
70
+ files[path] = await read_sandboxenv_file(contents)
69
71
 
70
72
  # read setup script from sample (add bash shebang if necessary)
71
73
  setup: bytes | None = None
72
74
  if sample.setup:
73
- setup = read_sandboxenv_file(sample.setup)
75
+ setup = await read_sandboxenv_file(sample.setup)
74
76
  setup_str = setup.decode(encoding="utf-8")
75
77
  if not setup_str.strip().startswith("#!"):
76
78
  setup_str = f"#!/usr/bin/env bash\n\n{setup_str}"
@@ -108,13 +110,16 @@ async def sandboxenv_context(
108
110
  )
109
111
 
110
112
 
111
- def read_sandboxenv_file(contents: str) -> bytes:
113
+ async def read_sandboxenv_file(contents: str) -> bytes:
112
114
  if is_data_uri(contents):
113
115
  contents_base64 = data_uri_to_base64(contents)
114
116
  file_bytes = base64.b64decode(contents_base64)
117
+ elif is_http_url(contents):
118
+ client = httpx.AsyncClient()
119
+ file_bytes = (await client.get(contents, follow_redirects=True)).content
115
120
  else:
116
121
  # try to read as a file (if it doesn't exist or has a path not cool w/
117
- # the fileystem then we fall back to contents)
122
+ # the filesystem then we fall back to contents)
118
123
  try:
119
124
  fs = filesystem(contents)
120
125
  if fs.exists(contents):
@@ -37,3 +37,4 @@ SAMPLE_SUBTASK = "sample"
37
37
  CONSOLE_DISPLAY_WIDTH = 120
38
38
  BASE_64_DATA_REMOVED = "<base64-data-removed>"
39
39
  SANDBOX_SETUP_TIMEOUT = 300
40
+ NO_CONTENT = "(no content)"
@@ -0,0 +1,61 @@
1
+ from typing import Literal
2
+
3
+
4
+ def get_service_by_port(port: int, protocol: Literal["tcp", "udp"]) -> str | None:
5
+ """
6
+ Returns the likely service running on a given port number.
7
+
8
+ Args:
9
+ port (int): The port number to look up
10
+ protocol (str): Either 'tcp' or 'udp'
11
+
12
+ Returns:
13
+ str: Description of the likely service, or None if not found
14
+ """
15
+ # Common port mappings based on IANA assignments and common usage
16
+ port_mappings = {
17
+ "tcp": {
18
+ 20: "FTP (Data)",
19
+ 21: "FTP (Control)",
20
+ 22: "SSH",
21
+ 23: "Telnet",
22
+ 25: "SMTP",
23
+ 53: "DNS",
24
+ 80: "HTTP",
25
+ 110: "POP3",
26
+ 143: "IMAP",
27
+ 443: "HTTPS",
28
+ 445: "Microsoft-DS (SMB)",
29
+ 587: "SMTP (Submission)",
30
+ 993: "IMAPS",
31
+ 995: "POP3S",
32
+ 1433: "Microsoft SQL Server",
33
+ 1521: "Oracle Database",
34
+ 3306: "MySQL",
35
+ 3389: "RDP (Remote Desktop)",
36
+ 5432: "PostgreSQL",
37
+ 5900: "VNC",
38
+ 5901: "VNC Display :1",
39
+ 5902: "VNC Display :2",
40
+ 6080: "noVNC",
41
+ 8080: "HTTP Alternate",
42
+ 8443: "HTTPS Alternate",
43
+ 27017: "MongoDB",
44
+ 27018: "MongoDB Shard",
45
+ 27019: "MongoDB Config Server",
46
+ },
47
+ "udp": {
48
+ 53: "DNS",
49
+ 67: "DHCP Server",
50
+ 68: "DHCP Client",
51
+ 69: "TFTP",
52
+ 123: "NTP",
53
+ 161: "SNMP",
54
+ 162: "SNMP Trap",
55
+ 514: "Syslog",
56
+ 1194: "OpenVPN",
57
+ 5353: "mDNS",
58
+ },
59
+ }
60
+
61
+ return port_mappings.get(protocol, {}).get(port, None)