inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl

This diff reflects the changes between two publicly released package versions as they appear in their public registry; it is provided for informational purposes only.
Files changed (182)
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/sandbox.py +4 -1
  8. inspect_ai/_cli/score.py +181 -32
  9. inspect_ai/_cli/trace.py +2 -0
  10. inspect_ai/_cli/view.py +4 -2
  11. inspect_ai/_display/core/config.py +7 -1
  12. inspect_ai/_display/core/progress.py +1 -1
  13. inspect_ai/_display/textual/app.py +8 -4
  14. inspect_ai/_display/textual/widgets/samples.py +6 -5
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/__init__.py +0 -0
  17. inspect_ai/_eval/eval.py +100 -97
  18. inspect_ai/_eval/evalset.py +69 -69
  19. inspect_ai/_eval/loader.py +122 -12
  20. inspect_ai/_eval/registry.py +1 -1
  21. inspect_ai/_eval/run.py +14 -0
  22. inspect_ai/_eval/score.py +125 -36
  23. inspect_ai/_eval/task/log.py +105 -4
  24. inspect_ai/_eval/task/results.py +92 -38
  25. inspect_ai/_eval/task/run.py +6 -2
  26. inspect_ai/_eval/task/sandbox.py +35 -2
  27. inspect_ai/_eval/task/task.py +49 -46
  28. inspect_ai/_util/__init__.py +0 -0
  29. inspect_ai/_util/constants.py +1 -1
  30. inspect_ai/_util/content.py +8 -0
  31. inspect_ai/_util/error.py +2 -0
  32. inspect_ai/_util/file.py +15 -1
  33. inspect_ai/_util/logger.py +4 -2
  34. inspect_ai/_util/registry.py +7 -1
  35. inspect_ai/_view/view.py +1 -2
  36. inspect_ai/_view/www/App.css +8 -3
  37. inspect_ai/_view/www/README.md +1 -1
  38. inspect_ai/_view/www/dist/assets/index.css +66 -38
  39. inspect_ai/_view/www/dist/assets/index.js +525 -523
  40. inspect_ai/_view/www/log-schema.json +86 -73
  41. inspect_ai/_view/www/package.json +1 -1
  42. inspect_ai/_view/www/src/App.tsx +1 -0
  43. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
  44. inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
  45. inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
  46. inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
  47. inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
  48. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
  49. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
  50. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
  51. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
  52. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
  53. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
  54. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
  55. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
  56. inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
  57. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
  58. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
  59. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
  60. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
  64. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
  65. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
  66. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
  67. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
  68. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
  69. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
  70. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
  72. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
  73. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
  74. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
  75. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
  76. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
  77. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
  78. inspect_ai/_view/www/src/types/log.d.ts +107 -19
  79. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
  80. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
  81. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
  82. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
  83. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
  84. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
  85. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
  86. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
  87. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
  88. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
  89. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  90. inspect_ai/approval/_approval.py +2 -0
  91. inspect_ai/approval/_approver.py +4 -4
  92. inspect_ai/approval/_auto.py +1 -1
  93. inspect_ai/approval/_human/approver.py +3 -0
  94. inspect_ai/approval/_policy.py +5 -0
  95. inspect_ai/approval/_registry.py +2 -2
  96. inspect_ai/dataset/_dataset.py +36 -45
  97. inspect_ai/dataset/_sources/__init__.py +0 -0
  98. inspect_ai/dataset/_sources/csv.py +13 -13
  99. inspect_ai/dataset/_sources/hf.py +29 -29
  100. inspect_ai/dataset/_sources/json.py +10 -10
  101. inspect_ai/log/__init__.py +2 -0
  102. inspect_ai/log/_convert.py +3 -3
  103. inspect_ai/log/_file.py +24 -9
  104. inspect_ai/log/_log.py +98 -7
  105. inspect_ai/log/_message.py +3 -1
  106. inspect_ai/log/_recorders/file.py +4 -0
  107. inspect_ai/log/_recorders/recorder.py +3 -0
  108. inspect_ai/log/_transcript.py +19 -8
  109. inspect_ai/model/__init__.py +2 -0
  110. inspect_ai/model/_cache.py +39 -21
  111. inspect_ai/model/_call_tools.py +2 -2
  112. inspect_ai/model/_chat_message.py +14 -4
  113. inspect_ai/model/_generate_config.py +1 -1
  114. inspect_ai/model/_model.py +31 -24
  115. inspect_ai/model/_model_output.py +14 -1
  116. inspect_ai/model/_openai.py +10 -18
  117. inspect_ai/model/_providers/google.py +9 -5
  118. inspect_ai/model/_providers/openai.py +5 -9
  119. inspect_ai/model/_providers/openrouter.py +1 -1
  120. inspect_ai/scorer/__init__.py +6 -1
  121. inspect_ai/scorer/_answer.py +1 -1
  122. inspect_ai/scorer/_classification.py +4 -0
  123. inspect_ai/scorer/_match.py +4 -5
  124. inspect_ai/scorer/_metric.py +87 -28
  125. inspect_ai/scorer/_metrics/__init__.py +3 -3
  126. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  127. inspect_ai/scorer/_metrics/mean.py +3 -17
  128. inspect_ai/scorer/_metrics/std.py +111 -30
  129. inspect_ai/scorer/_model.py +12 -12
  130. inspect_ai/scorer/_pattern.py +3 -3
  131. inspect_ai/scorer/_reducer/reducer.py +36 -21
  132. inspect_ai/scorer/_reducer/registry.py +2 -2
  133. inspect_ai/scorer/_reducer/types.py +7 -1
  134. inspect_ai/scorer/_score.py +11 -1
  135. inspect_ai/scorer/_scorer.py +110 -16
  136. inspect_ai/solver/__init__.py +1 -1
  137. inspect_ai/solver/_basic_agent.py +19 -22
  138. inspect_ai/solver/_bridge/__init__.py +0 -3
  139. inspect_ai/solver/_bridge/bridge.py +3 -3
  140. inspect_ai/solver/_chain.py +1 -2
  141. inspect_ai/solver/_critique.py +3 -3
  142. inspect_ai/solver/_fork.py +2 -2
  143. inspect_ai/solver/_human_agent/__init__.py +0 -0
  144. inspect_ai/solver/_human_agent/agent.py +5 -8
  145. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  146. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  147. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  148. inspect_ai/solver/_multiple_choice.py +15 -18
  149. inspect_ai/solver/_prompt.py +7 -7
  150. inspect_ai/solver/_solver.py +53 -52
  151. inspect_ai/solver/_task_state.py +80 -69
  152. inspect_ai/solver/_use_tools.py +9 -9
  153. inspect_ai/tool/__init__.py +2 -1
  154. inspect_ai/tool/_tool.py +43 -14
  155. inspect_ai/tool/_tool_call.py +6 -2
  156. inspect_ai/tool/_tool_choice.py +3 -1
  157. inspect_ai/tool/_tool_def.py +10 -8
  158. inspect_ai/tool/_tool_params.py +24 -0
  159. inspect_ai/tool/_tool_with.py +7 -7
  160. inspect_ai/tool/_tools/__init__.py +0 -0
  161. inspect_ai/tool/_tools/_computer/_common.py +2 -2
  162. inspect_ai/tool/_tools/_computer/_computer.py +11 -0
  163. inspect_ai/tool/_tools/_execute.py +15 -9
  164. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  165. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  166. inspect_ai/tool/_tools/_web_search.py +7 -5
  167. inspect_ai/util/_concurrency.py +3 -3
  168. inspect_ai/util/_panel.py +2 -0
  169. inspect_ai/util/_resource.py +12 -12
  170. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  171. inspect_ai/util/_sandbox/docker/config.py +2 -1
  172. inspect_ai/util/_sandbox/docker/docker.py +10 -1
  173. inspect_ai/util/_sandbox/docker/service.py +100 -0
  174. inspect_ai/util/_sandbox/environment.py +99 -96
  175. inspect_ai/util/_subprocess.py +5 -3
  176. inspect_ai/util/_subtask.py +15 -16
  177. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
  178. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
  179. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
  180. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
  181. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
  182. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/eval.py CHANGED
@@ -89,67 +89,67 @@ def eval(
  r"""Evaluate tasks using a Model.

  Args:
- tasks: (Tasks): Task(s) to evaluate. If None, attempt
+ tasks: Task(s) to evaluate. If None, attempt
  to evaluate a task in the current working directory
- model (str | Model | list[str] | list[Model] | None): Model(s) for
+ model: Model(s) for
  evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
  environment variable.
- model_base_url: (str | None): Base URL for communicating
+ model_base_url: Base URL for communicating
  with the model API.
- model_args (dict[str,Any] | str): Model creation args
+ model_args: Model creation args
  (as a dictionary or as a path to a JSON or YAML config file)
- task_args (dict[str,Any] | str): Task creation arguments
+ task_args: Task creation arguments
  (as a dictionary or as a path to a JSON or YAML config file)
- sandbox (SandboxEnvironmentType | None): Sandbox environment type
- (or optionally a str or tuple with a shorthand spec)
- sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
- (defaults to True)
- solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
- Optional (uses task solver by default).
- tags (list[str] | None): Tags to associate with this evaluation run.
- trace (bool | None): Trace message interactions with evaluated model to terminal.
- display (DisplayType | None): Task display type (defaults to 'full').
- approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
- Either a path to an approval policy config file or a list of approval policies.
- Defaults to no approval policy.
- log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
- "info", "warning", "error", or "critical" (defaults to "warning")
- log_level_transcript (str | None): Level for logging to the log file (defaults to "info")
- log_dir (str | None): Output path for logging results
- (defaults to file log in ./logs directory).
- log_format (Literal["eval", "json"] | None): Format for writing log files (defaults
- to "eval", the native high-performance format).
- limit (int | tuple[int, int] | None): Limit evaluated samples
- (defaults to all samples).
- sample_id (str | int | list[str | int] | None): Evaluate specific sample(s) from the dataset.
- epochs (int | Epochs | None): Epochs to repeat samples for and optional score
- reducer function(s) used to combine sample scores (defaults to "mean")
- fail_on_error (bool | float | None): `True` to fail on first sample error
- (default); `False` to never fail on sample errors; Value between 0 and 1
- to fail if a proportion of total samples fails. Value greater than 1 to fail
- eval if a count of samples fails.
- debug_errors (bool | None): Raise task errors (rather than logging them)
- so they can be debugged (defaults to False).
- message_limit (int | None): Limit on total messages used for each sample.
- token_limit (int | None): Limit on total tokens used for each sample.
- time_limit (int | None): Limit on time (in seconds) for execution of each sample.
- max_samples (int | None): Maximum number of samples to run in parallel
- (default is max_connections)
- max_tasks (int | None): Maximum number of tasks to run in parallel
- (default is 1)
- max_subprocesses (int | None): Maximum number of subprocesses to
- run in parallel (default is os.cpu_count())
- max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
- to run in parallel.
- log_samples: (bool | None): Log detailed samples and scores (defaults to True)
- log_images: (bool | None): Log base64 encoded version of images,
- even if specified as a filename or URL (defaults to False)
- log_buffer: (int | None): Number of samples to buffer before writing log file.
- If not specified, an appropriate default for the format and filesystem is
- chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
- score (bool): Score output (defaults to True)
- score_display (bool | None): Show scoring metrics in realtime (defaults to True)
- **kwargs (GenerateConfigArgs): Model generation options.
+ sandbox: Sandbox environment type
+ (or optionally a str or tuple with a shorthand spec)
+ sandbox_cleanup: Cleanup sandbox environments after task completes
+ (defaults to True)
+ solver: Alternative solver for task(s).
+ Optional (uses task solver by default).
+ tags: Tags to associate with this evaluation run.
+ trace: Trace message interactions with evaluated model to terminal.
+ display: Task display type (defaults to 'full').
+ approval: Tool use approval policies.
+ Either a path to an approval policy config file or a list of approval policies.
+ Defaults to no approval policy.
+ log_level: Level for logging to the console: "debug", "http", "sandbox",
+ "info", "warning", "error", or "critical" (defaults to "warning")
+ log_level_transcript: Level for logging to the log file (defaults to "info")
+ log_dir: Output path for logging results
+ (defaults to file log in ./logs directory).
+ log_format: Format for writing log files (defaults
+ to "eval", the native high-performance format).
+ limit: Limit evaluated samples
+ (defaults to all samples).
+ sample_id: Evaluate specific sample(s) from the dataset.
+ epochs: Epochs to repeat samples for and optional score
+ reducer function(s) used to combine sample scores (defaults to "mean")
+ fail_on_error: `True` to fail on first sample error
+ (default); `False` to never fail on sample errors; Value between 0 and 1
+ to fail if a proportion of total samples fails. Value greater than 1 to fail
+ eval if a count of samples fails.
+ debug_errors: Raise task errors (rather than logging them)
+ so they can be debugged (defaults to False).
+ message_limit: Limit on total messages used for each sample.
+ token_limit: Limit on total tokens used for each sample.
+ time_limit: Limit on time (in seconds) for execution of each sample.
+ max_samples: Maximum number of samples to run in parallel
+ (default is max_connections)
+ max_tasks: Maximum number of tasks to run in parallel
+ (default is 1)
+ max_subprocesses: Maximum number of subprocesses to
+ run in parallel (default is os.cpu_count())
+ max_sandboxes: Maximum number of sandboxes (per-provider)
+ to run in parallel.
+ log_samples: Log detailed samples and scores (defaults to True)
+ log_images: Log base64 encoded version of images,
+ even if specified as a filename or URL (defaults to False)
+ log_buffer: Number of samples to buffer before writing log file.
+ If not specified, an appropriate default for the format and filesystem is
+ chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+ score: Score output (defaults to True)
+ score_display: Show scoring metrics in realtime (defaults to True)
+ **kwargs: Model generation options.

  Returns:
  List of EvalLog (one for each task)
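This hunk converts the eval() docstring to plain, untyped parameter descriptions. For orientation, a minimal call exercising a few of the documented parameters might look like the sketch below; the task and model names are placeholders, not part of this diff.

```python
from inspect_ai import eval

# Placeholder task/model names; the parameters mirror the docstring above.
logs = eval(
    "theory_of_mind",        # task to evaluate
    model="openai/gpt-4o",   # model(s) for evaluation
    limit=10,                # evaluate only the first 10 samples
    log_dir="./logs",        # output path for logging results
    fail_on_error=0.1,       # fail the eval if more than 10% of samples error
)
```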
@@ -359,10 +359,14 @@ async def eval_async(
  "Trace mode cannot be used when evaluating multiple models."
  )

- # resolve recorder
+ # resolve recorder (confirm writeable)
  log_dir = log_dir if log_dir else os.environ.get("INSPECT_LOG_DIR", "./logs")
  log_dir = absolute_file_path(log_dir)
  recorder = create_recorder_for_format(log_format or DEFAULT_LOG_FORMAT, log_dir)
+ if not recorder.is_writeable():
+ raise PrerequisiteError(
+ f"ERROR: You do not have write permission for the log_dir '{log_dir}'"
+ )

  # resolve solver
  solver = chain(solver) if isinstance(solver, list) else solver
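This hunk adds an up-front writeability check on the log directory, so an unwritable log_dir now fails fast with a PrerequisiteError before any samples run. A hedged sketch of what that looks like from the caller's side (the read-only path is hypothetical):

```python
from inspect_ai import eval

# Hypothetical read-only location; per the hunk above, eval should now refuse
# to start rather than failing partway through the run.
try:
    eval("theory_of_mind", model="openai/gpt-4o", log_dir="/mnt/readonly/logs")
except Exception as ex:  # PrerequisiteError is raised from an internal module
    print(f"eval did not start: {ex}")
```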
@@ -492,47 +496,46 @@ def eval_retry(
  """Retry a previously failed evaluation task.

  Args:
- tasks: (str | EvalLogInfo | EvalLog | list[str] | list[EvalLogInfo] | list[EvalLog]):
- Log files for task(s) to retry.
- log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
- "info", "warning", "error", or "critical" (defaults to "warning")
- log_level_transcript (str | None): Level for logging to the log file (defaults to "info")
- log_dir (str | None): Output path for logging results
- (defaults to file log in ./logs directory).
- log_format (Literal["eval", "json"] | None): Format for writing log files (defaults
- to "eval", the native high-performance format).
- max_samples (int | None): Maximum number of samples to run in parallel
- (default is max_connections)
- max_tasks (int | None): Maximum number of tasks to run in parallel
- (default is 1)
- max_subprocesses (int | None): Maximum number of subprocesses to
- run in parallel (default is os.cpu_count())
- max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
- to run in parallel.
- sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
- (defaults to True)
- trace (bool | None): Trace message interactions with evaluated model to terminal.
- display (DisplayType | None): Task display type (defaults to 'full').
- fail_on_error (bool | float | None): `True` to fail on first sample error
- (default); `False` to never fail on sample errors; Value between 0 and 1
- to fail if a proportion of total samples fails. Value greater than 1 to fail
- eval if a count of samples fails.
- debug_errors (bool | None): Raise task errors (rather than logging them)
- so they can be debugged (defaults to False).
- log_samples: (bool | None): Log detailed samples and scores (defaults to True)
- log_images: (bool | None): Log base64 encoded version of images,
- even if specified as a filename or URL (defaults to False)
- log_buffer: (int | None): Number of samples to buffer before writing log file.
- If not specified, an appropriate default for the format and filesystem is
- chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
- score (bool): Score output (defaults to True)
- score_display (bool | None): Show scoring metrics in realtime (defaults to True)
- max_retries (int | None):
- Maximum number of times to retry request.
- timeout: (int | None):
- Request timeout (in seconds)
- max_connections (int | None):
- Maximum number of concurrent connections to Model API (default is per Model API)
+ tasks: Log files for task(s) to retry.
+ log_level: Level for logging to the console: "debug", "http", "sandbox",
+ "info", "warning", "error", or "critical" (defaults to "warning")
+ log_level_transcript: Level for logging to the log file (defaults to "info")
+ log_dir: Output path for logging results
+ (defaults to file log in ./logs directory).
+ log_format: Format for writing log files (defaults
+ to "eval", the native high-performance format).
+ max_samples: Maximum number of samples to run in parallel
+ (default is max_connections)
+ max_tasks: Maximum number of tasks to run in parallel
+ (default is 1)
+ max_subprocesses: Maximum number of subprocesses to
+ run in parallel (default is os.cpu_count())
+ max_sandboxes: Maximum number of sandboxes (per-provider)
+ to run in parallel.
+ sandbox_cleanup: Cleanup sandbox environments after task completes
+ (defaults to True)
+ trace: Trace message interactions with evaluated model to terminal.
+ display: Task display type (defaults to 'full').
+ fail_on_error: `True` to fail on first sample error
+ (default); `False` to never fail on sample errors; Value between 0 and 1
+ to fail if a proportion of total samples fails. Value greater than 1 to fail
+ eval if a count of samples fails.
+ debug_errors: Raise task errors (rather than logging them)
+ so they can be debugged (defaults to False).
+ log_samples: Log detailed samples and scores (defaults to True)
+ log_images: Log base64 encoded version of images,
+ even if specified as a filename or URL (defaults to False)
+ log_buffer: Number of samples to buffer before writing log file.
+ If not specified, an appropriate default for the format and filesystem is
+ chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+ score: Score output (defaults to True)
+ score_display: Show scoring metrics in realtime (defaults to True)
+ max_retries:
+ Maximum number of times to retry request.
+ timeout:
+ Request timeout (in seconds)
+ max_connections:
+ Maximum number of concurrent connections to Model API (default is per Model API)

  Returns:
  List of EvalLog (one for each task)
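The eval_retry() docstring gets the same untyped treatment. A minimal retry call using a few of the parameters documented above might look like this sketch (the log file path is a placeholder):

```python
from inspect_ai import eval_retry

# Placeholder log file; retries the failed task recorded in that log.
logs = eval_retry(
    "./logs/2025-01-15T10-00-00_theory_of_mind_abc123.eval",
    max_connections=5,    # cap concurrent connections to the model API
    fail_on_error=False,  # never fail the retry on individual sample errors
)
```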
inspect_ai/_eval/evalset.py CHANGED
@@ -93,79 +93,79 @@ def eval_set(
  r"""Evaluate a set of tasks.

  Args:
- tasks: (Tasks): Task(s) to evaluate. If None, attempt
+ tasks: Task(s) to evaluate. If None, attempt
  to evaluate a task in the current working directory
- log_dir (str): Output path for logging results
- (required to ensure that a unique storage scope is assigned for the set).
- retry_attempts: (int | None): Maximum number of retry attempts before giving up
- (defaults to 10).
- retry_wait (float | None): Time to wait between attempts, increased exponentially.
- (defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time
- per-retry will in no case by longer than 1 hour.
- retry_connections (float | None): Reduce max_connections at this rate with each retry
- (defaults to 0.5)
- retry_cleanup (bool | None): Cleanup failed log files after retries
- (defaults to True)
- model (str | Model | list[str] | list[Model] | None): Model(s) for
- evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
- environment variable.
- model_base_url: (str | None): Base URL for communicating
- with the model API.
- model_args (dict[str,Any] | str): Model creation args
- (as a dictionary or as a path to a JSON or YAML config file)
- task_args (dict[str,Any] | str): Task creation arguments
- (as a dictionary or as a path to a JSON or YAML config file)
- sandbox (SandboxEnvironmentType | None): Sandbox environment type
- (or optionally a str or tuple with a shorthand spec)
- sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
- (defaults to True)
- solver (Solver | list[Solver] | SolverSpec | None): Alternative solver(s) for
- evaluating task(s). ptional (uses task solver by default).
- tags (list[str] | None): Tags to associate with this evaluation run.
- trace: (bool | None): Trace message interactions with evaluated model to terminal.
- display (DisplayType | None): Task display type (defaults to 'full').
- approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
- Either a path to an approval policy config file or a list of approval policies.
- Defaults to no approval policy.
- score (bool): Score output (defaults to True)
- log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
- "info", "warning", "error", or "critical" (defaults to "warning")
- log_level_transcript (str | None): Level for logging to the log file (defaults to "info")
- log_format (Literal["eval", "json"] | None): Format for writing
- log files (defaults to "eval", the native high-performance format).
- limit (int | tuple[int, int] | None): Limit evaluated samples
- (defaults to all samples).
- sample_id (str | int | list[str | int] | None): Evaluate specific sample(s) from the dataset.
- epochs (int | Epochs | None): Epochs to repeat samples for and optional score
- reducer function(s) used to combine sample scores (defaults to "mean")
- fail_on_error (bool | float | None): `True` to fail on first sample error
- (default); `False` to never fail on sample errors; Value between 0 and 1
- to fail if a proportion of total samples fails. Value greater than 1 to fail
- eval if a count of samples fails.
- debug_errors (bool | None): Raise task errors (rather than logging them)
- so they can be debugged (defaults to False).
- message_limit (int | None): Limit on total messages used for each sample.
- token_limit (int | None): Limit on total tokens used for each sample.
- time_limit (int | None): Limit on time (in seconds) for execution of each sample.
- max_samples (int | None): Maximum number of samples to run in parallel
- (default is max_connections)
- max_tasks (int | None): Maximum number of tasks to run in parallel
- (default is 1)
- max_subprocesses (int | None): Maximum number of subprocesses to
- run in parallel (default is os.cpu_count())
- max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
- to run in parallel.
- log_samples: (bool | None): Log detailed samples and scores (defaults to True)
- log_images: (bool | None): Log base64 encoded version of images,
+ log_dir: Output path for logging results
+ (required to ensure that a unique storage scope is assigned for the set).
+ retry_attempts: Maximum number of retry attempts before giving up
+ (defaults to 10).
+ retry_wait: Time to wait between attempts, increased exponentially.
+ (defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time
+ per-retry will in no case by longer than 1 hour.
+ retry_connections: Reduce max_connections at this rate with each retry
+ (defaults to 0.5)
+ retry_cleanup: Cleanup failed log files after retries
+ (defaults to True)
+ model: Model(s) for
+ evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
+ environment variable.
+ model_base_url: Base URL for communicating
+ with the model API.
+ model_args: Model creation args
+ (as a dictionary or as a path to a JSON or YAML config file)
+ task_args: Task creation arguments
+ (as a dictionary or as a path to a JSON or YAML config file)
+ sandbox: Sandbox environment type
+ (or optionally a str or tuple with a shorthand spec)
+ sandbox_cleanup: Cleanup sandbox environments after task completes
+ (defaults to True)
+ solver: Alternative solver(s) for
+ evaluating task(s). ptional (uses task solver by default).
+ tags: Tags to associate with this evaluation run.
+ trace: Trace message interactions with evaluated model to terminal.
+ display: Task display type (defaults to 'full').
+ approval: Tool use approval policies.
+ Either a path to an approval policy config file or a list of approval policies.
+ Defaults to no approval policy.
+ score: Score output (defaults to True)
+ log_level: Level for logging to the console: "debug", "http", "sandbox",
+ "info", "warning", "error", or "critical" (defaults to "warning")
+ log_level_transcript: Level for logging to the log file (defaults to "info")
+ log_format: Format for writing
+ log files (defaults to "eval", the native high-performance format).
+ limit: Limit evaluated samples
+ (defaults to all samples).
+ sample_id: Evaluate specific sample(s) from the dataset.
+ epochs: Epochs to repeat samples for and optional score
+ reducer function(s) used to combine sample scores (defaults to "mean")
+ fail_on_error: `True` to fail on first sample error
+ (default); `False` to never fail on sample errors; Value between 0 and 1
+ to fail if a proportion of total samples fails. Value greater than 1 to fail
+ eval if a count of samples fails.
+ debug_errors: Raise task errors (rather than logging them)
+ so they can be debugged (defaults to False).
+ message_limit: Limit on total messages used for each sample.
+ token_limit: Limit on total tokens used for each sample.
+ time_limit: Limit on time (in seconds) for execution of each sample.
+ max_samples: Maximum number of samples to run in parallel
+ (default is max_connections)
+ max_tasks: Maximum number of tasks to run in parallel
+ (default is 1)
+ max_subprocesses: Maximum number of subprocesses to
+ run in parallel (default is os.cpu_count())
+ max_sandboxes: Maximum number of sandboxes (per-provider)
+ to run in parallel.
+ log_samples: Log detailed samples and scores (defaults to True)
+ log_images: Log base64 encoded version of images,
  even if specified as a filename or URL (defaults to False)
- log_buffer: (int | None): Number of samples to buffer before writing log file.
- If not specified, an appropriate default for the format and filesystem is
- chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
- bundle_dir: (str | None): If specified, the log viewer and logs generated
+ log_buffer: Number of samples to buffer before writing log file.
+ If not specified, an appropriate default for the format and filesystem is
+ chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+ bundle_dir: If specified, the log viewer and logs generated
  by this eval set will be bundled into this directory.
- bundle_overwrite (bool): Whether to overwrite files in the bundle_dir.
+ bundle_overwrite: Whether to overwrite files in the bundle_dir.
  (defaults to False).
- **kwargs (GenerateConfigArgs): Model generation options.
+ **kwargs: Model generation options.

  Returns:
  Tuple of bool (whether all tasks completed successfully) and list of EvalLog
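As with eval(), the eval_set() docstring drops inline type annotations. A minimal sketch of a call using the retry-related parameters documented above (the task names and directory are placeholders):

```python
from inspect_ai import eval_set

# Placeholder tasks and log_dir; log_dir is required so the set gets a unique storage scope.
success, logs = eval_set(
    ["theory_of_mind", "hellaswag"],
    model="openai/gpt-4o",
    log_dir="./logs/eval-set-01",
    retry_attempts=5,       # give up after 5 retry attempts
    retry_connections=0.5,  # halve max_connections on each retry
)
```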
inspect_ai/_eval/loader.py CHANGED
@@ -8,7 +8,7 @@ from importlib.util import module_from_spec, spec_from_loader
  from logging import getLogger
  from pathlib import Path
  from types import ModuleType
- from typing import Any, Callable, cast
+ from typing import Any, Callable, Tuple, cast

  from typing_extensions import overload

@@ -26,6 +26,7 @@ from inspect_ai._util.registry import (
  registry_params,
  )
  from inspect_ai.model import Model, ModelName
+ from inspect_ai.scorer._scorer import Scorer, ScorerSpec, scorer_create
  from inspect_ai.solver._bridge import bridge
  from inspect_ai.solver._solver import Solver, SolverSpec
  from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
@@ -421,16 +422,7 @@ def as_solver_spec(solver: Solver) -> SolverSpec:

  def solver_from_spec(spec: SolverSpec) -> Solver:
  # resolve @ reference
- spec_split = split_spec(spec.solver)
- if spec_split[1] is not None:
- solver_file: Path | None = Path(spec_split[0]).resolve()
- solver_name: str | None = spec_split[1]
- elif Path(spec_split[0]).exists():
- solver_file = Path(spec_split[0]).resolve()
- solver_name = None
- else:
- solver_file = None
- solver_name = spec_split[0]
+ solver_file, solver_name = parse_spec_str(spec.solver)

  # switch contexts if we are loading from a file
  create_cm = (
@@ -501,7 +493,7 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
  else:
  agent_fn = getattr(solver_module, solver_name, None)
  if inspect.isfunction(agent_fn):
- return bridge(agent_fn(**spec.args))
+ return bridge.bridge(agent_fn(**spec.args))
  elif agent_fn is not None:
  raise PrerequisiteError(
  f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
@@ -510,3 +502,121 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
  raise PrerequisiteError(
  f"The function {solver_name} was not found in file {pretty_solver_file}."
  )
+
+
+ def scorer_from_spec(spec: ScorerSpec, task_path: Path | None, **kwargs: Any) -> Scorer:
+ """
+ Load a scorer
+
+ Args:
+ spec: The scorer spec
+ task_path: An optional path to the task file
+ **kwargs: Additional keyword arguments passed to the scorer initialization
+
+ Returns:
+ Scorer: the loaded scorer
+
+ Raises:
+ PrerequisiteError: If the scorer cannot be found, loaded, or lacks required type annotations
+ """
+ # resolve @ reference
+ scorer_file, scorer_name = parse_spec_str(spec.scorer)
+
+ # switch contexts if we are loading from a file
+ create_cm = (
+ chdir_python(scorer_file.parent.as_posix())
+ if scorer_file is not None
+ else contextlib.nullcontext()
+ )
+
+ # pretty solver name for error messages
+ pretty_scorer_file = (
+ cwd_relative_path(scorer_file.as_posix()) if scorer_file else None
+ )
+
+ with create_cm:
+ # is there a scorer file being provided? if not, load from registry
+ if scorer_file is None:
+ if scorer_name is None:
+ raise ValueError(f"Unable to resolve scorer name from {spec.scorer}")
+
+ try:
+ return scorer_create(scorer_name, **kwargs)
+ except ValueError:
+ # We need a valid path to a scorer file to try to load the scorer from there
+ if not task_path:
+ raise PrerequisiteError(
+ f"The scorer '{scorer_name}' couldn't be loaded. Please provide a path to the file containing the scorer using the '--scorer' parameter"
+ )
+
+ task_pretty_path = task_path.as_posix()
+ if not task_path.exists():
+ raise PrerequisiteError(
+ f"The scorer `{scorer_name}` couldn't be loaded. The file '{task_pretty_path}' was not found. Please provide a path to the file containing the scorer using the '--scorer' parameter"
+ )
+
+ # We have the path to a file, so load that and try again
+ try:
+ load_module(task_path)
+ scorer_fn = scorer_create(scorer_name, **kwargs)
+
+ # See if the scorer doesn't have type annotations. Currently the registry will not load
+ # the function without type annotations.
+ # TODO: We could consider calling this ourselves if we're certain it is what we're looking for
+ signature = inspect.signature(scorer_fn)
+ if signature.return_annotation is inspect.Signature.empty:
+ raise PrerequisiteError(
+ f"The scorer '{scorer_name}' in the file '{task_pretty_path}' requires return type annotations. Please add type annotations to load the scorer."
+ )
+ return scorer_fn
+ except ValueError:
+ # we still couldn't load this, request the user provide a path
+ raise PrerequisiteError(
+ f"The scorer '{scorer_name}' in the file '{task_pretty_path}' couldn't be loaded. Please provide a path to the file containing the scorer using the '--scorer' parameter."
+ )
+ except ModuleNotFoundError:
+ # we still couldn't load this, request the user provide a path
+ raise PrerequisiteError(
+ f"The scorer '{scorer_name}' in the file '{task_pretty_path}' couldn't be loaded. Please provide a path to the file containing the scorer using the '--scorer' parameter."
+ )
+
+ # solver is a path, so load it that way
+ else:
+ load_module(scorer_file)
+ decorators = parse_decorators(scorer_file, "scorer")
+
+ # if there is no solver_name see if we can discover it
+ if scorer_name is None:
+ if len(decorators) == 1:
+ # decorator based solver
+ scorer_name = decorators[0][0]
+ elif len(decorators) == 0:
+ raise PrerequisiteError(
+ f"The source file {pretty_scorer_file} does not contain any @scorer functions."
+ )
+ else:
+ raise PrerequisiteError(
+ f"The source file {pretty_scorer_file} has more than one @solver function (qualify which solver using e.g. '{scorer_file.name}y@solver_fn')"
+ )
+
+ # create decorator based solvers using the registry
+ if any(solver[0] == scorer_name for solver in decorators):
+ return scorer_create(scorer_name, **kwargs)
+ else:
+ raise PrerequisiteError(
+ f"The function {scorer_name} was not found in file {pretty_scorer_file}."
+ )
+
+
+ def parse_spec_str(spec_str: str) -> Tuple[Path | None, str | None]:
+ spec_split = split_spec(spec_str)
+ if spec_split[1] is not None:
+ file: Path | None = Path(spec_split[0]).resolve()
+ name: str | None = spec_split[1]
+ elif Path(spec_split[0]).exists():
+ file = Path(spec_split[0]).resolve()
+ name = None
+ else:
+ file = None
+ name = spec_split[0]
+ return file, name
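The new scorer_from_spec()/parse_spec_str() mirror the existing solver loading: a spec is either a registry name or a "file@name" reference, and the error messages above point users at a '--scorer' parameter. A scorer file this loader could plausibly resolve (e.g. as my_scorers.py@exact_answer) might look like the sketch below; the file and function names are hypothetical, while the decorator and types are the public inspect_ai scorer API. Note the return type annotation on the inner score function, which the loader now checks for.

```python
# my_scorers.py (hypothetical file name)
from inspect_ai.scorer import CORRECT, INCORRECT, Score, Scorer, Target, accuracy, scorer
from inspect_ai.solver import TaskState


@scorer(metrics=[accuracy()])
def exact_answer() -> Scorer:
    # The registry requires type annotations; per the hunk above, the loader
    # raises a PrerequisiteError if the score function lacks a return annotation.
    async def score(state: TaskState, target: Target) -> Score:
        answer = state.output.completion.strip()
        return Score(
            value=CORRECT if answer == target.text else INCORRECT,
            answer=answer,
        )

    return score
```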
inspect_ai/_eval/registry.py CHANGED
@@ -148,7 +148,7 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
  # module import, so set its task file and run dir
  if get_installed_package_name(task_type) is None:
  module = inspect.getmodule(task_type)
- if module and hasattr(module, "__file__"):
+ if module and hasattr(module, "__file__") and module.__file__:
  file = Path(getattr(module, "__file__"))
  setattr(task_instance, TASK_FILE_ATTR, file.as_posix())
  setattr(task_instance, TASK_RUN_DIR_ATTR, file.parent.as_posix())
inspect_ai/_eval/run.py CHANGED
@@ -20,8 +20,10 @@ from inspect_ai.log import EvalConfig, EvalLog
  from inspect_ai.log._recorders import Recorder
  from inspect_ai.model import GenerateConfigArgs
  from inspect_ai.model._model import ModelName
+ from inspect_ai.scorer._metric import to_metric_specs
  from inspect_ai.scorer._reducer import ScoreReducer, reducer_log_names
  from inspect_ai.scorer._reducer.registry import validate_reducer
+ from inspect_ai.scorer._scorer import as_scorer_spec
  from inspect_ai.solver._solver import Solver, SolverSpec
  from inspect_ai.util._sandbox.environment import (
  SandboxEnvironmentConfigType,
@@ -100,6 +102,16 @@ async def eval_run(
  eval_solver = None
  eval_solver_spec = None

+ # resolve the task scorers
+ eval_scorer_specs = (
+ [as_scorer_spec(scorer) for scorer in task.scorer]
+ if task.scorer is not None
+ else None
+ )
+
+ # resolve task metrics
+ eval_metrics = to_metric_specs(task.metrics) if task.metrics is not None else None
+
  try:
  # create run tasks
  task_run_options: list[TaskRunOptions] = []
@@ -168,6 +180,8 @@ async def eval_run(
  tags=tags,
  model=resolved_task.model,
  dataset=task.dataset,
+ scorer=eval_scorer_specs,
+ metrics=eval_metrics,
  sandbox=resolved_task.sandbox,
  task_attribs=task.attribs,
  task_args=resolved_task.task_args,