PyPI - inspect-ai - Versions diffs - 0.3.58__py3-none-any.whl → 0.3.60__py3-none-any.whl - Mend

inspect-ai 0.3.58 (py3-none-any.whl) → 0.3.60 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (166) hide show

inspect_ai/_cli/common.py +3 -1
inspect_ai/_cli/eval.py +15 -9
inspect_ai/_display/core/active.py +4 -1
inspect_ai/_display/core/config.py +3 -3
inspect_ai/_display/core/panel.py +7 -3
inspect_ai/_display/plain/__init__.py +0 -0
inspect_ai/_display/plain/display.py +203 -0
inspect_ai/_display/rich/display.py +0 -5
inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
inspect_ai/_display/textual/widgets/samples.py +79 -12
inspect_ai/_display/textual/widgets/sandbox.py +37 -0
inspect_ai/_eval/eval.py +10 -1
inspect_ai/_eval/loader.py +79 -19
inspect_ai/_eval/registry.py +6 -0
inspect_ai/_eval/score.py +3 -1
inspect_ai/_eval/task/results.py +51 -22
inspect_ai/_eval/task/run.py +47 -13
inspect_ai/_eval/task/sandbox.py +10 -5
inspect_ai/_util/constants.py +1 -0
inspect_ai/_util/port_names.py +61 -0
inspect_ai/_util/text.py +23 -0
inspect_ai/_view/www/App.css +31 -1
inspect_ai/_view/www/dist/assets/index.css +31 -1
inspect_ai/_view/www/dist/assets/index.js +25498 -2044
inspect_ai/_view/www/log-schema.json +32 -2
inspect_ai/_view/www/package.json +2 -0
inspect_ai/_view/www/src/App.mjs +14 -16
inspect_ai/_view/www/src/Types.mjs +1 -2
inspect_ai/_view/www/src/api/Types.ts +133 -0
inspect_ai/_view/www/src/api/{api-browser.mjs → api-browser.ts} +25 -13
inspect_ai/_view/www/src/api/api-http.ts +219 -0
inspect_ai/_view/www/src/api/api-shared.ts +47 -0
inspect_ai/_view/www/src/api/{api-vscode.mjs → api-vscode.ts} +22 -19
inspect_ai/_view/www/src/api/{client-api.mjs → client-api.ts} +93 -53
inspect_ai/_view/www/src/api/index.ts +51 -0
inspect_ai/_view/www/src/api/jsonrpc.ts +225 -0
inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
inspect_ai/_view/www/src/components/DownloadButton.mjs +1 -1
inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
inspect_ai/_view/www/src/index.js +77 -4
inspect_ai/_view/www/src/log/{remoteLogFile.mjs → remoteLogFile.ts} +62 -46
inspect_ai/_view/www/src/navbar/Navbar.mjs +4 -1
inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +19 -10
inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
inspect_ai/_view/www/src/samples/SampleList.mjs +19 -49
inspect_ai/_view/www/src/samples/SampleScores.mjs +1 -1
inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +38 -26
inspect_ai/_view/www/src/samples/SamplesTab.mjs +14 -11
inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +2 -2
inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
inspect_ai/_view/www/src/types/log.d.ts +13 -2
inspect_ai/_view/www/src/utils/Format.mjs +10 -3
inspect_ai/_view/www/src/utils/{Json.mjs → json-worker.ts} +13 -9
inspect_ai/_view/www/src/utils/vscode.ts +36 -0
inspect_ai/_view/www/src/workspace/WorkSpace.mjs +11 -5
inspect_ai/_view/www/vite.config.js +7 -0
inspect_ai/_view/www/yarn.lock +116 -0
inspect_ai/approval/_human/__init__.py +0 -0
inspect_ai/approval/_human/manager.py +1 -1
inspect_ai/approval/_policy.py +12 -6
inspect_ai/log/_log.py +1 -1
inspect_ai/log/_samples.py +16 -0
inspect_ai/log/_transcript.py +4 -1
inspect_ai/model/_call_tools.py +59 -0
inspect_ai/model/_conversation.py +16 -7
inspect_ai/model/_generate_config.py +12 -12
inspect_ai/model/_model.py +117 -18
inspect_ai/model/_model_output.py +22 -2
inspect_ai/model/_openai.py +383 -0
inspect_ai/model/_providers/anthropic.py +152 -55
inspect_ai/model/_providers/azureai.py +21 -21
inspect_ai/model/_providers/bedrock.py +37 -40
inspect_ai/model/_providers/goodfire.py +248 -0
inspect_ai/model/_providers/google.py +46 -54
inspect_ai/model/_providers/groq.py +7 -3
inspect_ai/model/_providers/hf.py +6 -0
inspect_ai/model/_providers/mistral.py +13 -12
inspect_ai/model/_providers/openai.py +51 -218
inspect_ai/model/_providers/openai_o1.py +11 -12
inspect_ai/model/_providers/providers.py +23 -1
inspect_ai/model/_providers/together.py +12 -12
inspect_ai/model/_providers/util/__init__.py +2 -3
inspect_ai/model/_providers/util/hf_handler.py +1 -1
inspect_ai/model/_providers/util/llama31.py +1 -1
inspect_ai/model/_providers/util/util.py +0 -76
inspect_ai/model/_providers/vertex.py +1 -4
inspect_ai/scorer/_metric.py +3 -0
inspect_ai/scorer/_reducer/reducer.py +1 -1
inspect_ai/scorer/_scorer.py +4 -3
inspect_ai/solver/__init__.py +4 -5
inspect_ai/solver/_basic_agent.py +1 -1
inspect_ai/solver/_bridge/__init__.py +3 -0
inspect_ai/solver/_bridge/bridge.py +100 -0
inspect_ai/solver/_bridge/patch.py +170 -0
inspect_ai/solver/_prompt.py +35 -5
inspect_ai/solver/_solver.py +6 -0
inspect_ai/solver/_task_state.py +80 -38
inspect_ai/tool/__init__.py +2 -0
inspect_ai/tool/_tool.py +12 -1
inspect_ai/tool/_tool_call.py +10 -0
inspect_ai/tool/_tool_def.py +16 -5
inspect_ai/tool/_tool_with.py +21 -4
inspect_ai/tool/beta/__init__.py +5 -0
inspect_ai/tool/beta/_computer/__init__.py +3 -0
inspect_ai/tool/beta/_computer/_common.py +133 -0
inspect_ai/tool/beta/_computer/_computer.py +155 -0
inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
inspect_ai/util/__init__.py +2 -0
inspect_ai/util/_display.py +5 -0
inspect_ai/util/_limit.py +26 -0
inspect_ai/util/_sandbox/docker/docker.py +64 -1
inspect_ai/util/_sandbox/docker/internal.py +3 -1
inspect_ai/util/_sandbox/docker/prereqs.py +1 -1
inspect_ai/util/_sandbox/environment.py +14 -0
{inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/METADATA +3 -2
{inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/RECORD +159 -126
inspect_ai/_view/www/src/api/Types.mjs +0 -117
inspect_ai/_view/www/src/api/api-http.mjs +0 -300
inspect_ai/_view/www/src/api/api-shared.mjs +0 -10
inspect_ai/_view/www/src/api/index.mjs +0 -49
inspect_ai/_view/www/src/api/jsonrpc.mjs +0 -208
inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
inspect_ai/_view/www/src/utils/vscode.mjs +0 -16
{inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/LICENSE +0 -0
{inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/WHEEL +0 -0
{inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/top_level.txt +0 -0

inspect_ai/_display/textual/widgets/sandbox.py ADDED Viewed

@@ -0,0 +1,37 @@
+from textual.app import ComposeResult
+from textual.containers import Horizontal, Vertical
+from textual.widgets import Static
+from inspect_ai.util._sandbox.environment import SandboxConnection
+from .port_mappings import PortMappingsView
+class SandboxView(Vertical):
+    """Vertical container that displays a single sandbox connection.
+    Shows an optional name header, then the connection command and, when the
+    connection has any, its port mappings.
+    """
+    DEFAULT_CSS = """
+    .indent {
+        width: 2;
+    }
+    .no_indent {
+        width: 0;
+    }
+    """
+    def __init__(
+        self,
+        connection: SandboxConnection,
+        name: str | None,  # if None, no header or indent
+    ) -> None:
+        """Keep the connection and the optional display name for compose()."""
+        super().__init__()
+        self.connection = connection
+        self.sandbox_name = name
+    def compose(self) -> ComposeResult:
+        """Yield the header (only when named) followed by the connection details."""
+        # header line is emitted only for a non-empty sandbox name
+        if self.sandbox_name:
+            yield Static(self.sandbox_name)
+        # the empty spacer column is 2 wide under a header, 0 wide otherwise
+        spacer_class = "indent" if self.sandbox_name else "no_indent"
+        with Horizontal():
+            yield Static("", classes=spacer_class)
+            with Vertical():
+                yield Static(self.connection.command)
+                port_mappings = self.connection.ports
+                if port_mappings:
+                    yield PortMappingsView(port_mappings)

inspect_ai/_eval/eval.py CHANGED Viewed

@@ -35,7 +35,12 @@ from inspect_ai.scorer._reducer import reducer_log_names
 from inspect_ai.solver._chain import chain
 from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import SandboxEnvironmentType
-from inspect_ai.util._display import DisplayType, display_type, init_display_type
+from inspect_ai.util._display import (
+    DisplayType,
+    display_type,
+    display_type_initialized,
+    init_display_type,
+)
 from .context import init_eval_context
 from .loader import ResolvedTask, resolve_tasks
@@ -306,6 +311,10 @@ async def eval_async(
     _eval_async_running = True
+    # if we are called outside of eval() then set display type to "plain"
+    if not display_type_initialized():
+        init_display_type("plain")
     # resolve model and task args
     model_args = resolve_args(model_args)
     task_args = resolve_args(task_args)

inspect_ai/_eval/loader.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import ast
 import contextlib
+import inspect
 import os
 from dataclasses import dataclass, field
 from importlib.machinery import SourceFileLoader
@@ -9,11 +10,13 @@ from pathlib import Path
 from types import ModuleType
 from typing import Any, Callable, cast
+from typing_extensions import overload
 from inspect_ai._eval.task.util import task_file, task_run_dir
 from inspect_ai._util.decorator import parse_decorators
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.logger import warn_once
-from inspect_ai._util.path import chdir_python
+from inspect_ai._util.path import chdir_python, cwd_relative_path
 from inspect_ai._util.registry import (
     RegistryInfo,
     is_registry_object,
@@ -23,6 +26,7 @@ from inspect_ai._util.registry import (
     registry_params,
 )
 from inspect_ai.model import Model, ModelName
+from inspect_ai.solver._bridge import bridge
 from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
 from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
@@ -334,6 +338,16 @@ def split_spec(spec: str) -> tuple[str, str | None]:
         return spec, None
+@overload
+def load_module(
+    module_path: Path, filter: Callable[[str], bool]
+) -> ModuleType | None: ...
+@overload
+def load_module(module_path: Path, filter: None = None) -> ModuleType: ...
 def load_module(
     module_path: Path, filter: Callable[[str], bool] | None = None
 ) -> ModuleType | None:
@@ -425,28 +439,74 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
         else contextlib.nullcontext()
     )
+    # pretty solver name for error messages
+    pretty_solver_file = (
+        cwd_relative_path(solver_file.as_posix()) if solver_file else None
+    )
     with create_cm:
-        # if we have a file then we need to load it and (if required) determine the solver name
-        if solver_file is not None:
-            # load the module so that registry_create works
-            load_module(solver_file)
+        # if there is no solver file then just create from the registry by name
+        if solver_file is None:
+            if solver_name is None:
+                raise ValueError(f"Unable to resolve solver name from {spec.solver}")
+            return cast(Solver, registry_create("solver", solver_name, **spec.args))
-            # if there is no solver_name we need to discover the first @solver
+        # we do have a solver file
+        else:
+            # load the module and parse decorators
+            solver_module = load_module(solver_file)
+            decorators = parse_decorators(solver_file, "solver")
+            # if there is no solver_name see if we can discover it
             if solver_name is None:
-                solvers = parse_decorators(solver_file, "solver")
-                if len(solvers) == 0:
+                if len(decorators) == 1:
+                    # decorator based solver
+                    solver_name = decorators[0][0]
+                elif len(decorators) == 0:
+                    # see if we can find an agent based solver
+                    functions = [
+                        function
+                        for function in inspect.getmembers(
+                            solver_module, inspect.isfunction
+                        )
+                        if function[1].__module__ == solver_module.__name__
+                    ]
+                    agent_functions = [
+                        function
+                        for function in functions
+                        if "agent" in function[0] and not function[0].startswith("_")
+                    ]
+                    if len(agent_functions) == 1:
+                        # agent based solver
+                        solver_name = agent_functions[0][0]
+                    elif len(agent_functions) == 0:
+                        raise PrerequisiteError(
+                            f"The source file {pretty_solver_file} does not contain any @solver functions or agent functions."
+                        )
+                    else:
+                        raise PrerequisiteError(
+                            f"The source file {pretty_solver_file} has more than one agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
+                        )
+                else:
                     raise PrerequisiteError(
-                        f"The source file {solver_file.as_posix()} does not contain any @solver functions."
+                        # example spec has the same '<file>@<function>' shape as the agent message above
+                        f"The source file {pretty_solver_file} has more than one @solver function (qualify which solver using e.g. '{solver_file.name}@solver_fn')"
                     )
-                if len(solvers) > 1:
-                    raise PrerequisiteError(
-                        f"The source file {solver_file.as_posix()} has more than one @solver function (qualify which solver using file.py@solver)"
-                    )
-                solver_name = solvers[0][0]
-        # make mypy happy and catch unexpected branching
-        if solver_name is None:
-            raise ValueError(f"Unable to resolve solver name from {spec.solver}")
+            # create decorator based solvers using the registry
+            if any(solver[0] == solver_name for solver in decorators):
+                return cast(Solver, registry_create("solver", solver_name, **spec.args))
-        solver = cast(Solver, registry_create("solver", solver_name, **spec.args))
-        return solver
+            # create agent based solvers by calling the function and wrapping it in bridge()
+            else:
+                agent_fn = getattr(solver_module, solver_name, None)
+                if inspect.isfunction(agent_fn):
+                    return bridge(agent_fn(**spec.args))
+                elif agent_fn is not None:
+                    raise PrerequisiteError(
+                        f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
+                    )
+                else:
+                    raise PrerequisiteError(
+                        f"The function {solver_name} was not found in file {pretty_solver_file}."
+                    )

inspect_ai/_eval/registry.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import inspect
 import logging
 from copy import deepcopy
+from functools import wraps
 from pathlib import Path
 from typing import Any, Callable, TypeVar, cast, overload
@@ -125,6 +126,7 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
         params = list(inspect.signature(task_type).parameters.keys())
         # Create and return the wrapper function
+        @wraps(task_type)
         def wrapper(*w_args: Any, **w_kwargs: Any) -> Task:
             # Create the task
             task_instance = task_type(*w_args, **w_kwargs)
@@ -154,6 +156,10 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
             # Return the task instance
             return task_instance
+        # functools.wraps overrides the return type annotation of the inner function, so
+        # we explicitly set it again
+        wrapper.__annotations__["return"] = Task
         # Register the task and return the wrapper
         return task_register(
             task=cast(TaskType, wrapper), name=task_name, attribs=attribs, params=params

inspect_ai/_eval/score.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Callable, cast
 from inspect_ai._display import display
 from inspect_ai._util.path import chdir_python
 from inspect_ai._util.platform import platform_init
-from inspect_ai._util.registry import registry_create
+from inspect_ai._util.registry import registry_create, registry_unqualified_name
 from inspect_ai.log import (
     EvalLog,
     EvalMetric,
@@ -85,6 +85,7 @@ async def score_async(
             sample_id=sample.id,
             epoch=sample.epoch,
             input=sample.input,
+            target=Target(sample.target),
             choices=sample.choices,
             messages=sample.messages,
             output=sample.output,
@@ -184,6 +185,7 @@ async def run_score_task(
         results[scorer_name] = SampleScore(
             score=result,
             sample_id=state.sample_id,
+            scorer=registry_unqualified_name(scorer),
         )
     progress()

inspect_ai/_eval/task/results.py CHANGED Viewed

@@ -2,6 +2,7 @@ import fnmatch
 import re
 from collections import defaultdict
 from copy import deepcopy
+from dataclasses import dataclass, field
 from typing import Any, Tuple, cast
 from inspect_ai._util.registry import (
@@ -19,6 +20,8 @@ from inspect_ai.log import (
 from inspect_ai.log._log import EvalSampleReductions
 from inspect_ai.scorer import Metric, Score, Scorer
 from inspect_ai.scorer._metric import SampleScore
+from inspect_ai.scorer._metrics.accuracy import accuracy
+from inspect_ai.scorer._metrics.std import stderr
 from inspect_ai.scorer._reducer import ScoreReducer, mean_score, reducer_log_name
 from inspect_ai.scorer._scorer import (
     SCORER_METRICS,
@@ -27,6 +30,27 @@ from inspect_ai.scorer._scorer import (
 )
+@dataclass
+class ScorerInfo:
+    """Name, metrics, params and metadata describing one scorer when building results."""
+    name: str
+    metrics: list[Metric | dict[str, list[Metric]]] | dict[str, list[Metric]]
+    params: dict[str, Any] = field(default_factory=dict)
+    metadata: dict[str, Any] = field(default_factory=dict)
+    @staticmethod
+    def from_scorer(scorer: Scorer) -> "ScorerInfo":
+        """Build a ScorerInfo from a scorer object using the registry helpers."""
+        unqualified_name = registry_unqualified_name(scorer)
+        metric_targets = scorer_metrics(scorer)
+        # work on a deep copy so the deletion below acts on the copy only
+        other_metadata = deepcopy(registry_info(scorer).metadata)
+        del other_metadata[SCORER_METRICS]
+        return ScorerInfo(
+            name=unqualified_name,
+            metrics=metric_targets,
+            params=registry_params(scorer),
+            metadata=other_metadata,
+        )
+    @staticmethod
+    def from_name(name: str) -> "ScorerInfo":
+        """Build a ScorerInfo for a bare score name with accuracy and stderr metrics."""
+        return ScorerInfo(
+            name=name,
+            metrics=[accuracy(), stderr()],
+        )
 def eval_results(
     samples: int,
     scores: list[dict[str, SampleScore]],
@@ -38,18 +62,24 @@ def eval_results(
     results = EvalResults(total_samples=samples, completed_samples=len(scores))
     reductions = None
+    # extract scorers info from scorers then create scorers info for any
+    # scores not already accounted for by a scorer name
+    scorers_info = [ScorerInfo.from_scorer(scorer) for scorer in (scorers or [])]
+    scorer_names = {info.name for info in scorers_info}
+    for sample_scores in scores:
+        for name, sample_score in sample_scores.items():
+            if sample_score.scorer is None and name not in scorer_names:
+                scorers_info.append(ScorerInfo.from_name(name))
+                scorer_names.add(name)
     # record scorer
-    if scorers:
+    if len(scorers_info) > 0:
         result_scores: list[EvalScore] = []
         sample_reductions: list[EvalSampleReductions] = []
-        for scorer in scorers:
-            # extract non-metrics metadata
-            metadata = deepcopy(registry_info(scorer).metadata)
-            del metadata[SCORER_METRICS]
+        for scorer_info in scorers_info:
             # this scorer
             scorer_name = unique_scorer_name(
-                scorer, [eval_score.name for eval_score in result_scores]
+                scorer_info.name, [eval_score.name for eval_score in result_scores]
             )
             # scores for this scorer
@@ -75,7 +105,7 @@ def eval_results(
                 # Compute metrics for this scorer
                 simple_scores = cast(list[Score], reduced_scores)
-                targets = metrics if metrics is not None else scorer_metrics(scorer)
+                targets = metrics if metrics is not None else scorer_info.metrics
                 if isinstance(targets, list):
                     ## split the metrics into the simple metrics and any dictionary
                     ## metrics, to be processed independently
@@ -88,8 +118,7 @@ def eval_results(
                     result_scores.extend(
                         scorer_for_metrics(
                             scorer_name=scorer_name,
-                            scorer=scorer,
-                            metadata=metadata,
+                            scorer_info=scorer_info,
                             scores=simple_scores,
                             metrics=simple_metrics,
                             reducer_name=reducer_display_nm,
@@ -99,8 +128,7 @@ def eval_results(
                         result_scores.extend(
                             scorers_from_metric_dict(
                                 scorer_name=scorer_name,
-                                scorer=scorer,
-                                metadata=metadata,
+                                scorer_info=scorer_info,
                                 scores=simple_scores,
                                 metrics=dict_metric,
                                 reducer_name=reducer_display_nm,
@@ -116,8 +144,7 @@ def eval_results(
                     result_scores.extend(
                         scorers_from_metric_dict(
                             scorer_name=scorer_name,
-                            scorer=scorer,
-                            metadata=metadata,
+                            scorer_info=scorer_info,
                             scores=simple_scores,
                             metrics=targets,
                             reducer_name=reducer_display_nm,
@@ -156,8 +183,7 @@ def split_metrics(
 def scorer_for_metrics(
     scorer_name: str,
-    scorer: Scorer,
-    metadata: dict[str, Any],
+    scorer_info: ScorerInfo,
     scores: list[Score],
     metrics: list[Metric],
     reducer_name: str | None = None,
@@ -218,8 +244,10 @@ def scorer_for_metrics(
             scorer=scorer_name,
             reducer=reducer_name,
             name=scorer_name,
-            params=registry_params(scorer),
-            metadata=metadata if len(metadata.keys()) > 0 else None,
+            params=scorer_info.params,
+            metadata=scorer_info.metadata
+            if len(scorer_info.metadata.keys()) > 0
+            else None,
             metrics=list_metrics,
         )
     )
@@ -228,8 +256,7 @@ def scorer_for_metrics(
 def scorers_from_metric_dict(
     scorer_name: str,
-    scorer: Scorer,
-    metadata: dict[str, Any],
+    scorer_info: ScorerInfo,
     scores: list[Score],
     metrics: dict[str, list[Metric]],
     reducer_name: str | None = None,
@@ -299,8 +326,10 @@ def scorers_from_metric_dict(
                 scorer=scorer_name,
                 reducer=reducer_name,
                 name=metric_key,
-                params=registry_params(scorer),
-                metadata=metadata if len(metadata.keys()) > 0 else None,
+                params=scorer_info.params,
+                metadata=scorer_info.metadata
+                if len(scorer_info.metadata.keys()) > 0
+                else None,
                 metrics=result_metrics,
             )
         )

inspect_ai/_eval/task/run.py CHANGED Viewed

@@ -30,8 +30,9 @@ from inspect_ai._util.hooks import send_telemetry
 from inspect_ai._util.registry import (
     is_registry_object,
     registry_log_name,
+    registry_unqualified_name,
 )
-from inspect_ai._util.timeouts import Timeout, timeout, timeout_at
+from inspect_ai._util.timeouts import Timeout, timeout
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
 from inspect_ai.log import (
@@ -45,7 +46,11 @@ from inspect_ai.log import (
 from inspect_ai.log._condense import condense_sample
 from inspect_ai.log._file import eval_log_json_str
 from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
-from inspect_ai.log._samples import active_sample
+from inspect_ai.log._samples import (
+    active_sample,
+    set_active_sample_message_limit,
+    set_active_sample_token_limit,
+)
 from inspect_ai.log._transcript import (
     ErrorEvent,
     SampleInitEvent,
@@ -72,6 +77,7 @@ from inspect_ai.solver._chain import Chain, unroll
 from inspect_ai.solver._fork import set_task_generate
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
+from inspect_ai.util._limit import SampleLimitExceededError
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._subtask import init_subtask
@@ -538,6 +544,9 @@ async def task_run_sample(
     # helper to handle exceptions (will throw if we've exceeded the limit)
     def handle_error(ex: BaseException) -> EvalError:
+        """Log a warning for the sample error, record it in the transcript, and return it."""
         err = sample_error(ex)
+        py_logger.warning(
+            f"Sample error (id: {sample.id}, epoch: {state.epoch}): {exception_message(ex)}"
+        )
         transcript()._event(ErrorEvent(error=err))
         return err
@@ -630,30 +639,43 @@ async def task_run_sample(
                     else:
                         raise
+                except SampleLimitExceededError as ex:
+                    # sample limit event
+                    transcript()._event(
+                        SampleLimitEvent(
+                            type=ex.type,
+                            limit=ex.limit,
+                            message=f"Sample completed: {ex.message}",
+                        )
+                    )
+                    # capture most recent state for scoring
+                    state = sample_state() or state
+                    state.completed = True
                 except BaseException as ex:
                     error = handle_error(ex)
-                # set timeout for scoring. if the original timeout was never hit
-                # then just create a new timeout_cm targeting the original
-                # timeout time. if the original timeout was hit we still want
-                # to provide an opportunity for scoring, but we don't necessarily
+                # set timeout for scoring. if the original timeout was hit we still
+                # want to provide opportunity for scoring, but we don't necessarily
                 # want to wait the full timeout again (especially in the case where
                 # the cause of the timeout is a hung container and scoring requires
                 # interacting with the container). as a middle ground we use half
                 # of the original timeout value for scoring.
                 if isinstance(timeout_cm, Timeout):
-                    if not timeout_cm.expired():
-                        timeout_cm = timeout_at(timeout_cm.when())
-                    else:
-                        assert time_limit
-                        timeout_cm = timeout(time_limit / 2)
+                    assert time_limit
+                    timeout_cm = timeout(time_limit / 2)
+                # turn off sample limits
+                set_active_sample_token_limit(None)
+                set_active_sample_message_limit(None)
                 # scoring
                 try:
                     # timeout during scoring will result in an ordinary sample error
                     async with timeout_cm:
-                        if scorers and error is None:
-                            for scorer in scorers:
+                        if error is None:
+                            for scorer in scorers or []:
                                 scorer_name = unique_scorer_name(
                                     scorer, list(results.keys())
                                 )
@@ -667,6 +689,7 @@ async def task_run_sample(
                                         sample_score = SampleScore(
                                             score=score_result,
                                             sample_id=sample.id,
+                                            scorer=registry_unqualified_name(scorer),
                                         )
                                         transcript()._event(
                                             ScoreEvent(
@@ -675,6 +698,16 @@ async def task_run_sample(
                                         )
                                         results[scorer_name] = sample_score
+                            # add scores returned by solvers
+                            if state.scores is not None:
+                                for name, score in state.scores.items():
+                                    results[name] = SampleScore(
+                                        score=score, sample_id=state.sample_id
+                                    )
+                            # propagate results into scores
+                            state.scores = {k: v.score for k, v in results.items()}
                 except asyncio.CancelledError:
                     if active.interrupt_action:
                         transcript()._event(
@@ -819,6 +852,7 @@ async def resolve_dataset(
                 epoch=epoch,
                 model=model_name,
                 input=sample.input,
+                target=Target(sample.target),
                 choices=sample.choices,
                 messages=sample_messages(sample),
                 message_limit=message_limit,

inspect_ai/_eval/task/sandbox.py CHANGED Viewed

@@ -4,11 +4,13 @@ import contextlib
 from random import random
 from typing import AsyncGenerator, Callable, NamedTuple, cast
+import httpx
 from inspect_ai._eval.task.task import Task
 from inspect_ai._eval.task.util import task_run_dir
 from inspect_ai._util.file import file, filesystem
 from inspect_ai._util.registry import registry_unqualified_name
-from inspect_ai._util.url import data_uri_to_base64, is_data_uri
+from inspect_ai._util.url import data_uri_to_base64, is_data_uri, is_http_url
 from inspect_ai.dataset import Sample
 from inspect_ai.util._concurrency import concurrency
 from inspect_ai.util._sandbox.context import (
@@ -65,12 +67,12 @@ async def sandboxenv_context(
         files: dict[str, bytes] = {}
         if sample.files:
             for path, contents in sample.files.items():
-                files[path] = read_sandboxenv_file(contents)
+                files[path] = await read_sandboxenv_file(contents)
         # read setup script from sample (add bash shebang if necessary)
         setup: bytes | None = None
         if sample.setup:
-            setup = read_sandboxenv_file(sample.setup)
+            setup = await read_sandboxenv_file(sample.setup)
             setup_str = setup.decode(encoding="utf-8")
             if not setup_str.strip().startswith("#!"):
                 setup_str = f"#!/usr/bin/env bash\n\n{setup_str}"
@@ -108,13 +110,16 @@ async def sandboxenv_context(
                 )
-def read_sandboxenv_file(contents: str) -> bytes:
+async def read_sandboxenv_file(contents: str) -> bytes:
     if is_data_uri(contents):
         contents_base64 = data_uri_to_base64(contents)
         file_bytes = base64.b64decode(contents_base64)
+    elif is_http_url(contents):
+        client = httpx.AsyncClient()
+        file_bytes = (await client.get(contents, follow_redirects=True)).content
     else:
         # try to read as a file (if it doesn't exist or has a path not cool w/
-        # the fileystem then we fall back to contents)
+        # the filesystem then we fall back to contents)
         try:
             fs = filesystem(contents)
             if fs.exists(contents):

inspect_ai/_util/constants.py CHANGED Viewed

@@ -37,3 +37,4 @@ SAMPLE_SUBTASK = "sample"
 CONSOLE_DISPLAY_WIDTH = 120
 BASE_64_DATA_REMOVED = "<base64-data-removed>"
 SANDBOX_SETUP_TIMEOUT = 300
+NO_CONTENT = "(no content)"

inspect_ai/_util/port_names.py ADDED Viewed

@@ -0,0 +1,61 @@
+from typing import Literal
+def get_service_by_port(port: int, protocol: Literal["tcp", "udp"]) -> str | None:
+    """
+    Returns the likely service running on a given port number.
+    Args:
+        port (int): The port number to look up
+        protocol (str): Either 'tcp' or 'udp'
+    Returns:
+        str: Description of the likely service, or None if not found
+    """
+    # Common port mappings based on IANA assignments and common usage
+    port_mappings = {
+        "tcp": {
+            20: "FTP (Data)",
+            21: "FTP (Control)",
+            22: "SSH",
+            23: "Telnet",
+            25: "SMTP",
+            53: "DNS",
+            80: "HTTP",
+            110: "POP3",
+            143: "IMAP",
+            443: "HTTPS",
+            445: "Microsoft-DS (SMB)",
+            587: "SMTP (Submission)",
+            993: "IMAPS",
+            995: "POP3S",
+            1433: "Microsoft SQL Server",
+            1521: "Oracle Database",
+            3306: "MySQL",
+            3389: "RDP (Remote Desktop)",
+            5432: "PostgreSQL",
+            5900: "VNC",
+            5901: "VNC Display :1",
+            5902: "VNC Display :2",
+            6080: "noVNC",
+            8080: "HTTP Alternate",
+            8443: "HTTPS Alternate",
+            27017: "MongoDB",
+            27018: "MongoDB Shard",
+            27019: "MongoDB Config Server",
+        },
+        "udp": {
+            53: "DNS",
+            67: "DHCP Server",
+            68: "DHCP Client",
+            69: "TFTP",
+            123: "NTP",
+            161: "SNMP",
+            162: "SNMP Trap",
+            514: "Syslog",
+            1194: "OpenVPN",
+            5353: "mDNS",
+        },
+    }
+    return port_mappings.get(protocol, {}).get(port, None)

inspect-ai 0.3.58__py3-none-any.whl → 0.3.60__py3-none-any.whl

inspect-ai 0.3.58__py3-none-any.whl → 0.3.60__py3-none-any.whl