inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/sandbox.py +4 -1
  8. inspect_ai/_cli/score.py +181 -32
  9. inspect_ai/_cli/trace.py +2 -0
  10. inspect_ai/_cli/view.py +4 -2
  11. inspect_ai/_display/core/config.py +7 -1
  12. inspect_ai/_display/core/progress.py +1 -1
  13. inspect_ai/_display/textual/app.py +8 -4
  14. inspect_ai/_display/textual/widgets/samples.py +6 -5
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/__init__.py +0 -0
  17. inspect_ai/_eval/eval.py +100 -97
  18. inspect_ai/_eval/evalset.py +69 -69
  19. inspect_ai/_eval/loader.py +122 -12
  20. inspect_ai/_eval/registry.py +1 -1
  21. inspect_ai/_eval/run.py +14 -0
  22. inspect_ai/_eval/score.py +125 -36
  23. inspect_ai/_eval/task/log.py +105 -4
  24. inspect_ai/_eval/task/results.py +92 -38
  25. inspect_ai/_eval/task/run.py +6 -2
  26. inspect_ai/_eval/task/sandbox.py +35 -2
  27. inspect_ai/_eval/task/task.py +49 -46
  28. inspect_ai/_util/__init__.py +0 -0
  29. inspect_ai/_util/constants.py +1 -1
  30. inspect_ai/_util/content.py +8 -0
  31. inspect_ai/_util/error.py +2 -0
  32. inspect_ai/_util/file.py +15 -1
  33. inspect_ai/_util/logger.py +4 -2
  34. inspect_ai/_util/registry.py +7 -1
  35. inspect_ai/_view/view.py +1 -2
  36. inspect_ai/_view/www/App.css +8 -3
  37. inspect_ai/_view/www/README.md +1 -1
  38. inspect_ai/_view/www/dist/assets/index.css +66 -38
  39. inspect_ai/_view/www/dist/assets/index.js +525 -523
  40. inspect_ai/_view/www/log-schema.json +86 -73
  41. inspect_ai/_view/www/package.json +1 -1
  42. inspect_ai/_view/www/src/App.tsx +1 -0
  43. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
  44. inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
  45. inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
  46. inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
  47. inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
  48. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
  49. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
  50. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
  51. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
  52. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
  53. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
  54. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
  55. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
  56. inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
  57. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
  58. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
  59. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
  60. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
  64. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
  65. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
  66. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
  67. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
  68. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
  69. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
  70. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
  72. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
  73. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
  74. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
  75. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
  76. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
  77. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
  78. inspect_ai/_view/www/src/types/log.d.ts +107 -19
  79. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
  80. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
  81. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
  82. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
  83. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
  84. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
  85. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
  86. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
  87. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
  88. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
  89. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  90. inspect_ai/approval/_approval.py +2 -0
  91. inspect_ai/approval/_approver.py +4 -4
  92. inspect_ai/approval/_auto.py +1 -1
  93. inspect_ai/approval/_human/approver.py +3 -0
  94. inspect_ai/approval/_policy.py +5 -0
  95. inspect_ai/approval/_registry.py +2 -2
  96. inspect_ai/dataset/_dataset.py +36 -45
  97. inspect_ai/dataset/_sources/__init__.py +0 -0
  98. inspect_ai/dataset/_sources/csv.py +13 -13
  99. inspect_ai/dataset/_sources/hf.py +29 -29
  100. inspect_ai/dataset/_sources/json.py +10 -10
  101. inspect_ai/log/__init__.py +2 -0
  102. inspect_ai/log/_convert.py +3 -3
  103. inspect_ai/log/_file.py +24 -9
  104. inspect_ai/log/_log.py +98 -7
  105. inspect_ai/log/_message.py +3 -1
  106. inspect_ai/log/_recorders/file.py +4 -0
  107. inspect_ai/log/_recorders/recorder.py +3 -0
  108. inspect_ai/log/_transcript.py +19 -8
  109. inspect_ai/model/__init__.py +2 -0
  110. inspect_ai/model/_cache.py +39 -21
  111. inspect_ai/model/_call_tools.py +2 -2
  112. inspect_ai/model/_chat_message.py +14 -4
  113. inspect_ai/model/_generate_config.py +1 -1
  114. inspect_ai/model/_model.py +31 -24
  115. inspect_ai/model/_model_output.py +14 -1
  116. inspect_ai/model/_openai.py +10 -18
  117. inspect_ai/model/_providers/google.py +9 -5
  118. inspect_ai/model/_providers/openai.py +5 -9
  119. inspect_ai/model/_providers/openrouter.py +1 -1
  120. inspect_ai/scorer/__init__.py +6 -1
  121. inspect_ai/scorer/_answer.py +1 -1
  122. inspect_ai/scorer/_classification.py +4 -0
  123. inspect_ai/scorer/_match.py +4 -5
  124. inspect_ai/scorer/_metric.py +87 -28
  125. inspect_ai/scorer/_metrics/__init__.py +3 -3
  126. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  127. inspect_ai/scorer/_metrics/mean.py +3 -17
  128. inspect_ai/scorer/_metrics/std.py +111 -30
  129. inspect_ai/scorer/_model.py +12 -12
  130. inspect_ai/scorer/_pattern.py +3 -3
  131. inspect_ai/scorer/_reducer/reducer.py +36 -21
  132. inspect_ai/scorer/_reducer/registry.py +2 -2
  133. inspect_ai/scorer/_reducer/types.py +7 -1
  134. inspect_ai/scorer/_score.py +11 -1
  135. inspect_ai/scorer/_scorer.py +110 -16
  136. inspect_ai/solver/__init__.py +1 -1
  137. inspect_ai/solver/_basic_agent.py +19 -22
  138. inspect_ai/solver/_bridge/__init__.py +0 -3
  139. inspect_ai/solver/_bridge/bridge.py +3 -3
  140. inspect_ai/solver/_chain.py +1 -2
  141. inspect_ai/solver/_critique.py +3 -3
  142. inspect_ai/solver/_fork.py +2 -2
  143. inspect_ai/solver/_human_agent/__init__.py +0 -0
  144. inspect_ai/solver/_human_agent/agent.py +5 -8
  145. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  146. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  147. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  148. inspect_ai/solver/_multiple_choice.py +15 -18
  149. inspect_ai/solver/_prompt.py +7 -7
  150. inspect_ai/solver/_solver.py +53 -52
  151. inspect_ai/solver/_task_state.py +80 -69
  152. inspect_ai/solver/_use_tools.py +9 -9
  153. inspect_ai/tool/__init__.py +2 -1
  154. inspect_ai/tool/_tool.py +43 -14
  155. inspect_ai/tool/_tool_call.py +6 -2
  156. inspect_ai/tool/_tool_choice.py +3 -1
  157. inspect_ai/tool/_tool_def.py +10 -8
  158. inspect_ai/tool/_tool_params.py +24 -0
  159. inspect_ai/tool/_tool_with.py +7 -7
  160. inspect_ai/tool/_tools/__init__.py +0 -0
  161. inspect_ai/tool/_tools/_computer/_common.py +2 -2
  162. inspect_ai/tool/_tools/_computer/_computer.py +11 -0
  163. inspect_ai/tool/_tools/_execute.py +15 -9
  164. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  165. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  166. inspect_ai/tool/_tools/_web_search.py +7 -5
  167. inspect_ai/util/_concurrency.py +3 -3
  168. inspect_ai/util/_panel.py +2 -0
  169. inspect_ai/util/_resource.py +12 -12
  170. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  171. inspect_ai/util/_sandbox/docker/config.py +2 -1
  172. inspect_ai/util/_sandbox/docker/docker.py +10 -1
  173. inspect_ai/util/_sandbox/docker/service.py +100 -0
  174. inspect_ai/util/_sandbox/environment.py +99 -96
  175. inspect_ai/util/_subprocess.py +5 -3
  176. inspect_ai/util/_subtask.py +15 -16
  177. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
  178. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
  179. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
  180. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
  181. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
  182. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
inspect_ai/scorer/_reducer/registry.py
@@ -40,9 +40,9 @@ def score_reducer(
     """Decorator for registering Score Reducers.
 
     Args:
-        func (ScoreReducerType | None): Function returning `ScoreReducer` targeted by
+        func: Function returning `ScoreReducer` targeted by
            plain task decorator without attributes (e.g. `@score_reducer`)
-        name (str | None): Optional name for reducer. If the decorator has no name
+        name: Optional name for reducer. If the decorator has no name
           argument then the name of the function will be used to automatically assign a name.
 
     Returns:
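
The hunk above reformats the docstring of the registration decorator. For orientation, a minimal sketch of registering a custom reducer with it, assuming `score_reducer`, `ScoreReducer`, and `Score` are importable from `inspect_ai.scorer`; the reduction logic itself is illustrative:

```python
from inspect_ai.scorer import Score, ScoreReducer, score_reducer

@score_reducer(name="median_score")
def median_score() -> ScoreReducer:
    def reduce(scores: list[Score]) -> Score:
        # illustrative reduction: take the middle score across epochs
        ordered = sorted(scores, key=lambda s: s.as_float())
        return ordered[len(ordered) // 2]

    return reduce
```
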
inspect_ai/scorer/_reducer/types.py
@@ -5,7 +5,13 @@ from .._metric import Score
 
 @runtime_checkable
 class ScoreReducer(Protocol):
-    def __call__(self, scores: list[Score]) -> Score: ...
+    def __call__(self, scores: list[Score]) -> Score:
+        """Reduce a set of scores to a single score.
+
+        Args:
+            scores: List of scores.
+        """
+        ...
 
     @property
     def __name__(self) -> str: ...
inspect_ai/scorer/_score.py
@@ -23,6 +23,8 @@ async def score(state: TaskState) -> list[Score]:
        a task that does not have a scorer.
 
     """
+    from inspect_ai.log._transcript import ScoreEvent, transcript
+
     scorers = _scorers.get(None)
     target = _target.get(None)
     if scorers is None or target is None:
@@ -30,7 +32,15 @@ async def score(state: TaskState) -> list[Score]:
             "The score() function can only be called while executing a task with a scorer."
         )
 
-    return [await scorer(state, target) for scorer in scorers]
+    scores: list[Score] = []
+    for scorer in scorers:
+        score = await scorer(state, target)
+        scores.append(score)
+        transcript()._event(
+            ScoreEvent(score=score, target=target.target, intermediate=True)
+        )
+
+    return scores
 
 
 def init_scoring_context(scorers: list[Scorer], target: Target) -> None:
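
With this change, each call to `score()` also emits a `ScoreEvent` marked `intermediate=True` to the transcript. A hedged sketch of calling it from a solver, assuming `score` is exported from `inspect_ai.scorer`; the solver itself is hypothetical:

```python
from inspect_ai.scorer import score
from inspect_ai.solver import Generate, Solver, TaskState, solver

@solver
def generate_and_check() -> Solver:
    async def solve(state: TaskState, generate: Generate) -> TaskState:
        state = await generate(state)
        # only valid while a task with a scorer is running; each intermediate
        # score is now also recorded to the transcript as a ScoreEvent
        intermediate = await score(state)
        state.metadata["intermediate_scores"] = [s.value for s in intermediate]
        return state

    return solve
```
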
inspect_ai/scorer/_scorer.py
@@ -1,3 +1,5 @@
+from copy import deepcopy
+from dataclasses import dataclass, field
 from functools import wraps
 from typing import (
     Any,
@@ -9,38 +11,74 @@ from typing import (
 )
 
 from inspect_ai._util._async import is_callable_coroutine
+from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.registry import (
     RegistryInfo,
+    is_registry_object,
     registry_add,
     registry_create,
     registry_info,
     registry_name,
+    registry_params,
     registry_tag,
     registry_unqualified_name,
 )
 from inspect_ai.solver._task_state import TaskState
 
-from ._metric import Metric, Score
+from ._metric import Metric, MetricSpec, Score, as_metric_spec
 from ._target import Target
 
 
 @runtime_checkable
 class Scorer(Protocol):
-    r"""Score model outputs.
-
-    Evaluate the passed outputs and targets and return a
-    dictionary with scoring outcomes and context.
-
-    Args:
-        state (TaskState): Task state
-        target (Target): Ideal target for the output.
-    """
-
     async def __call__(
         self,
         state: TaskState,
         target: Target,
-    ) -> Score: ...
+    ) -> Score:
+        r"""Score model outputs.
+
+        Evaluate the passed outputs and targets and return a
+        dictionary with scoring outcomes and context.
+
+        Args:
+            state: Task state
+            target: Ideal target for the output.
+
+        Examples:
+            ```python
+            @scorer
+            def custom_scorer() -> Scorer:
+                async def score(state: TaskState, target: Target) -> Score:
+                    # Compare state / model output with target
+                    # to yield a score
+                    return Score(value=...)
+
+                return score
+            ````
+        """
+        ...
+
+
+@dataclass(frozen=True)
+class ScorerSpec:
+    """Scorer specification used to (re-)create scorers."""
+
+    scorer: str
+    """Scorer name"""
+
+    args: dict[str, Any] = field(default_factory=dict)
+    """Scorer arguments."""
+
+    metadata: dict[str, Any] | None = field(default=None)
+    """Scorer metadata"""
+
+    metrics: (
+        list[MetricSpec | dict[str, list[MetricSpec]]]
+        | dict[str, list[MetricSpec]]
+        | None
+    ) = field(default=None)
+    """Scorer metrics"""
 
 
 P = ParamSpec("P")
@@ -90,17 +128,28 @@ def scorer(
     r"""Decorator for registering scorers.
 
     Args:
-        metrics (list[Metric] | dict[str, list[Metric]]): One or more metrics to calculate
+        metrics: One or more metrics to calculate
           over the scores.
-        name (str | None):
-          Optional name for scorer. If the decorator has no name
+        name: Optional name for scorer. If the decorator has no name
          argument then the name of the underlying ScorerType
          object will be used to automatically assign a name.
-        **metadata (dict[str,Any]): Additional values to serialize
+        **metadata: Additional values to serialize
          in metadata.
 
     Returns:
        Scorer with registry attributes.
+
+    Examples:
+        ```python
+        @scorer
+        def custom_scorer() -> Scorer:
+            async def score(state: TaskState, target: Target) -> Score:
+                # Compare state / model output with target
+                # to yield a score
+                return Score(value=...)
+
+            return score
+        ````
     """
 
     def wrapper(scorer_type: Callable[P, Scorer]) -> Callable[P, Scorer]:
@@ -142,6 +191,51 @@ def scorer(
     return wrapper
 
 
+def as_scorer_spec(scorer: Scorer) -> ScorerSpec:
+    if not is_registry_object(scorer):
+        raise PrerequisiteError(
+            f"The scorer {getattr(scorer, '__name__', '<unknown>')} was not created by a function decorated with @scorer so cannot be recorded."
+        )
+    name = registry_unqualified_name(scorer)
+    metrics = scorer_metrics(scorer)
+    resolved_metrics = resolve_metrics(metrics)
+
+    args = registry_params(scorer)
+    metadata = deepcopy(registry_info(scorer).metadata)
+    del metadata[SCORER_METRICS]
+
+    return ScorerSpec(
+        scorer=name, args=args, metadata=metadata, metrics=resolved_metrics
+    )
+
+
+def resolve_metrics(
+    metrics: list[Metric | dict[str, list[Metric]]] | dict[str, list[Metric]],
+) -> (
+    list[MetricSpec | dict[str, list[MetricSpec]]] | dict[str, list[MetricSpec]] | None
+):
+    if isinstance(metrics, list):
+        resolved_metrics: list[MetricSpec | dict[str, list[MetricSpec]]] = []
+        for metric_item in metrics:
+            if isinstance(metric_item, Metric):
+                resolved_metrics.append(as_metric_spec(metric_item))
+            else:
+                resolved_metrics.append(
+                    {
+                        metric_group: [
+                            as_metric_spec(metric) for metric in metrics_list
+                        ]
+                        for metric_group, metrics_list in metric_item.items()
+                    }
+                )
+        return resolved_metrics
+    else:
+        return {
+            metric_group: [as_metric_spec(metric) for metric in metrics_list]
+            for metric_group, metrics_list in metrics.items()
+        }
+
+
 def scorer_metrics(
     scorer: Scorer,
 ) -> list[Metric | dict[str, list[Metric]]] | dict[str, list[Metric]]:
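
The new `ScorerSpec` / `as_scorer_spec` machinery records how a registered scorer and its metrics were constructed. For orientation, a hedged sketch of the kind of decorated scorer this operates on (standard `@scorer` usage; the comparison logic is illustrative):

```python
from inspect_ai.scorer import Score, Scorer, Target, accuracy, scorer, stderr
from inspect_ai.solver import TaskState

@scorer(metrics=[accuracy(), stderr()])
def exact_match() -> Scorer:
    async def score(state: TaskState, target: Target) -> Score:
        # compare the model completion against the sample target
        answer = state.output.completion.strip()
        return Score(
            value=1.0 if answer == target.text else 0.0,
            answer=answer,
        )

    return score
```

Because the decorator stores metrics and arguments in the registry, `as_scorer_spec()` can presumably recover the scorer name, its args, and resolved `MetricSpec`s for recording in the log.
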
inspect_ai/solver/__init__.py
@@ -1,7 +1,7 @@
 from inspect_ai._util.deprecation import relocated_module_attribute
 
 from ._basic_agent import basic_agent
-from ._bridge import bridge
+from ._bridge.bridge import bridge
 from ._chain import chain
 from ._critique import self_critique
 from ._fork import fork
inspect_ai/solver/_basic_agent.py
@@ -81,31 +81,28 @@ def basic_agent(
        alternate conversion scheme as required via `score_value`.
 
     Args:
-        init: (Solver | list[Solver] | None): Agent initialisation
-          (defaults to system_message with basic ReAct prompt)
-        tools (list[Tool | ToolDef] | Solver | None): Tools available for the agent. Either a
-          list of tools or a Solver that can yield dynamic tools per-sample.
-        cache: (bool | CachePolicy): Caching behaviour for generate responses
-          (defaults to no caching).
-        max_attempts (int): Maximum number of submissions to accept before terminating.
-        message_limit (int | None): Limit on messages in sample before terminating agent.
+        init: Agent initialisation (defaults to system_message with basic ReAct prompt)
+        tools: Tools available for the agent. Either a list of tools or a Solver that
+          can yield dynamic tools per-sample.
+        cache: Caching behaviour for generate responses (defaults to no caching).
+        max_attempts: Maximum number of submissions to accept before terminating.
+        message_limit: Limit on messages in sample before terminating agent.
          If not specified, will use limit_messages defined for the task. If there is none
          defined for the task, 50 will be used as a default.
-        token_limit (int | None): Limit on tokens used in sample before terminating agent.
-        max_tool_output (int | None): Maximum output length (in bytes).
+        token_limit: Limit on tokens used in sample before terminating agent.
+        max_tool_output: Maximum output length (in bytes).
          Defaults to max_tool_output from active GenerateConfig.
-        score_value (ValueToFloat): Function used to extract float from scores (defaults
-          to standard value_to_float())
-        incorrect_message (str | Callable[[TaskState, list[Score]], str | Awaitable[str]]):
-          User message reply for an incorrect submission from the model. Alternatively,
-          a function which returns a message (function may optionally be async)
-        continue_message (str): User message to urge the model to continue when it
-          doesn't make a tool call.
-        submit_name (str): Name for tool used to make submissions
-          (defaults to 'submit')
-        submit_description (str): Description of submit tool (defaults to
-          'Submit an answer for evaluation')
-        **kwargs (Any): Deprecated arguments for backward compatibility.
+        score_value: Function used to extract float from scores (defaults
+          to standard value_to_float())
+        incorrect_message: User message reply for an incorrect submission from the model.
+          Alternatively, a function which returns a message (function may optionally be async)
+        continue_message: User message to urge the model to continue when it
+          doesn't make a tool call.
+        submit_name: Name for tool used to make submissions
+          (defaults to 'submit')
+        submit_description: Description of submit tool (defaults to
+          'Submit an answer for evaluation')
+        **kwargs: Deprecated arguments for backward compatibility.
 
     Returns:
        Plan for agent.
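
A hedged sketch of how these arguments are typically passed, using the standard `Task`/`basic_agent`/`bash` APIs; the dataset, prompts, and limits below are placeholders:

```python
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import includes
from inspect_ai.solver import basic_agent, system_message
from inspect_ai.tool import bash

@task
def demo_ctf() -> Task:
    return Task(
        dataset=[Sample(input="Find the flag in /challenge", target="flag{...}")],
        solver=basic_agent(
            init=system_message("You are a capable security researcher."),
            tools=[bash(timeout=180)],   # tools can also be a Solver for per-sample tools
            max_attempts=3,
            message_limit=30,
        ),
        scorer=includes(),
        sandbox="docker",
    )
```
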
inspect_ai/solver/_bridge/__init__.py
@@ -1,3 +0,0 @@
-from .bridge import bridge
-
-__all__ = ["bridge"]
inspect_ai/solver/_bridge/bridge.py
@@ -17,7 +17,7 @@ from .._task_state import TaskState
 def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solver:
     """Bridge an external agent into an Inspect Solver.
 
-    See documentation at https://inspect.ai-safety-institute.org.uk/agent-bridge.html
+    See documentation at <https://inspect.ai-safety-institute.org.uk/agent-bridge.html>
 
     Args:
       agent: Callable which takes a sample `dict` and returns a result `dict`.
@@ -63,11 +63,11 @@ def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solv
             else state.input
         )
 
-        # create sample
+        # create sample (use standard gpt-4 message encoding -- i.e. no 'developer' messages)
         sample = BridgeSample(
            sample_id=str(state.sample_id),
            epoch=state.epoch,
-           input=await openai_chat_messages(input, state.model.name),
+           input=await openai_chat_messages(input, model="gpt-4"),
            metadata=state.metadata,
            target=list(state.target),
        )
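
A hedged sketch of the bridge contract described in the docstring: the external agent receives a sample `dict` whose `input` holds OpenAI-format chat messages (now encoded with the gpt-4 scheme per the change above) and returns a result `dict`. The `input` key appears in the hunk; the `output` key in the result is an assumption for illustration only:

```python
from inspect_ai.solver import Solver, bridge

async def my_agent(sample: dict) -> dict:
    # sample["input"] holds OpenAI-format chat messages (see the hunk above);
    # returning {"output": ...} is an assumed result shape, not confirmed by this diff
    last_user = sample["input"][-1]["content"]
    return {"output": f"Echoing the task: {last_user}"}

def my_solver() -> Solver:
    return bridge(my_agent)
```
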
inspect_ai/solver/_chain.py
@@ -15,8 +15,7 @@ def chain(*solvers: Solver | list[Solver]) -> Solver:
        early.
 
     Args:
-        solvers (*Solver | list[Solver]): One or more solvers
-          or lists of solvers to chain together.
+        *solvers: One or more solvers or lists of solvers to chain together.
 
     Returns:
        Solver that executes the passed solvers as a chain.
inspect_ai/solver/_critique.py
@@ -25,15 +25,15 @@ def self_critique(
        need to use the model being evaluated).
 
     Args:
-        critique_template (str | None): String or path to file
+        critique_template: String or path to file
          containing critique template. The template uses two
          variables: `question` and `completion`.
          Variables from sample `metadata` are also available
          in the template.
-        completion_template (str | None): String or path to file
+        completion_template: String or path to file
          containing completion template. The template uses
          three variables: `question`, `completion`, and `critique`
-        model (str | Model | None): Alternate model to be used
+        model: Alternate model to be used
          for critique (by default the model being evaluated
          is used).
     """
inspect_ai/solver/_fork.py
@@ -32,8 +32,8 @@ async def fork(
        Store that doesn't affect the Store of other subtasks or the parent).
 
     Args:
-        state (TaskState): Beginning TaskState
-        solvers (Solver | list[Solver]): Solvers to apply on the TaskState.
+        state: Beginning TaskState
+        solvers: Solvers to apply on the TaskState.
          Each Solver will get a standalone copy of the TaskState.
 
     Returns:
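
For context, a hedged sketch of forking a `TaskState` across two candidate solver chains (called from inside some enclosing solver; the prompting strategies are illustrative):

```python
from inspect_ai.solver import TaskState, chain, fork, generate, prompt_template

async def explore(state: TaskState) -> list[TaskState]:
    # each chain receives an isolated copy of the state (and of the Store)
    return await fork(
        state,
        [
            chain(prompt_template("{prompt}\n\nAnswer concisely."), generate()),
            chain(prompt_template("{prompt}\n\nThink step by step."), generate()),
        ],
    )
```
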
inspect_ai/solver/_human_agent/__init__.py (file without changes)
inspect_ai/solver/_human_agent/agent.py
@@ -30,14 +30,11 @@ def human_agent(
        using a VS Code Window or Terminal.
 
     Args:
-        answer (bool | str): Is an explicit answer required for this
-          task or is it scored based on files in the container? Pass a
-          `str` with a regex to validate that the answer matches
-          the expected format.
-        intermediate_scoring (bool): Allow the human agent to
-          check their score while working.
-        record_session (bool): Record all user commands and outputs in
-          the sandbox bash session.
+        answer: Is an explicit answer required for this task or is it scored
+          based on files in the container? Pass a `str` with a regex to validate
+          that the answer matches the expected format.
+        intermediate_scoring: Allow the human agent to check their score while working.
+        record_session: Record all user commands and outputs in the sandbox bash session.
 
     Returns:
        Solver: Human agent solver.
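
A hedged sketch of configuring the human agent per the reworded docstring (the regex is a placeholder):

```python
from inspect_ai.solver import human_agent

solver = human_agent(
    answer=r"^flag\{.+\}$",      # require an answer matching this pattern
    intermediate_scoring=True,   # let the human check their score while working
    record_session=True,         # record the sandbox bash session
)
```
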
inspect_ai/solver/_human_agent/commands/clock.py
@@ -27,14 +27,10 @@ class StartCommand(HumanAgentCommand):
         print(call_human_agent("start"))
 
     def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
-        from inspect_ai.log._transcript import transcript
-
         async def start() -> str:
             if not state.running:
                 state.running = True
-                transcript().info(
-                    f"Task started (total time: {format_progress_time(state.time)})"
-                )
+                clock_action_event("start", state)
             return render_status(state)
 
         return start
@@ -57,14 +53,22 @@ class StopCommand(HumanAgentCommand):
         print(call_human_agent("stop"))
 
     def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
-        from inspect_ai.log._transcript import transcript
-
         async def stop() -> str:
             if state.running:
                 state.running = False
-                transcript().info(
-                    f"Task stopped (total time: {format_progress_time(state.time)})"
-                )
+                clock_action_event("stop", state)
             return render_status(state)
 
         return stop
+
+
+def clock_action_event(action: str, state: HumanAgentState) -> None:
+    from inspect_ai.log._transcript import transcript
+
+    transcript().info(
+        {
+            "action": action,
+            "total_time": format_progress_time(state.time, False),
+        },
+        source="human_agent",
+    )
inspect_ai/solver/_human_agent/commands/note.py
@@ -37,6 +37,6 @@ class NoteCommand(HumanAgentCommand):
         from inspect_ai.log._transcript import transcript
 
         async def note(content: str) -> None:
-            transcript().info(content)
+            transcript().info(content, source="human_agent")
 
         return note
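
The clock and note commands now tag their transcript entries with `source="human_agent"`. Mirroring that call shape, a hedged sketch of writing a structured info event from custom code; the payload and source name are illustrative, and the private `inspect_ai.log._transcript` import follows the diff rather than a documented public API:

```python
from inspect_ai.log._transcript import transcript

# record a structured info event attributed to a named source
transcript().info(
    {"action": "checkpoint", "elapsed": "00:12:30"},
    source="my_agent",
)
```
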
inspect_ai/solver/_human_agent/commands/score.py
@@ -1,6 +1,5 @@
 from argparse import Namespace
 from copy import deepcopy
-from textwrap import dedent
 from typing import Awaitable, Callable, Literal
 
 from pydantic import JsonValue
@@ -51,8 +50,6 @@ class ScoreCommand(HumanAgentCommand):
 
     def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
         async def score_task(answer: str | None) -> str:
-            from inspect_ai.log._transcript import transcript
-
             # make a copy of TaskState, add the answer, then score
             if answer:
                 task_state = deepcopy(self._state)
@@ -64,14 +61,6 @@ class ScoreCommand(HumanAgentCommand):
             # record the scoring action in our state
             state.scorings.append(IntermediateScoring(time=state.time, scores=result))
 
-            # record to transcript
-            transcript().info(
-                dedent(f"""
-                ### Intermediate Score
-                **Answer:** {result[0].answer}, **Score:** {result[0].as_str()}
-                """)
-            )
-
             # notify user
             return render_text(
                 f"[bold]Answer:[/bold] {result[0].answer}, [bold]Score:[/bold] {result[0].as_str()}"
inspect_ai/solver/_multiple_choice.py
@@ -219,38 +219,35 @@ def multiple_choice(
     multiple_correct: bool = False,
     **kwargs: Unpack[DeprecatedArgs],
 ) -> Solver:
-    """Multiple choice question solver.
-
-    Formats a multiple choice question prompt, then calls `generate()`
-
-    ### Usage
+    """Multiple choice question solver. Formats a multiple choice question prompt, then calls `generate()`.
 
     Note that due to the way this solver works, it has some constraints:
 
-      1. The `Sample` must have the `choices` attribute set.
-      2. The only built-in compatible scorer is the `choice` scorer.
-      3. It calls `generate()` internally, so you don't need to call it again
-
-    ### Shuffling
-
-    You can shuffle choices when you load your dataset by using the `shuffle_choices` method or parameter of the datasets API.
+    1. The `Sample` must have the `choices` attribute set.
+    2. The only built-in compatible scorer is the `choice` scorer.
+    3. It calls `generate()` internally, so you don't need to call it again
 
     Args:
-      template (str | None): Template to use for the multiple choice question.
+      template: Template to use for the multiple choice question.
         The defaults vary based on the options and are taken from the `MultipleChoiceTemplate` enum. The template will have questions and possible answers substituted into it before being sent to the model. Consequently it requires three specific template variables:
-          - `{question}`: The question to be asked.
-          - `{choices}`: The choices available, which will be formatted as a
+
+        - `{question}`: The question to be asked.
+        - `{choices}`: The choices available, which will be formatted as a
            list of A) ... B) ... etc. before sending to the model.
-          - `{letters}`: (optional) A string of letters representing the choices, e.g.
+        - `{letters}`: (optional) A string of letters representing the choices, e.g.
            "A,B,C". Used to be explicit to the model about the possible answers.
-      cot (bool): Default `False`. Whether the solver should perform chain-of-thought
+      cot: Default `False`. Whether the solver should perform chain-of-thought
         reasoning before answering. NOTE: this has no effect if you provide a custom template.
-      multiple_correct (bool): Default `False`. Whether to allow multiple
+      multiple_correct: Default `False`. Whether to allow multiple
        answers to the multiple choice question. For example, "What numbers are
        squares? A) 3, B) 4, C) 9" has multiple correct answers, B and C. Leave
        as `False` if there's exactly one correct answer from the choices
       available. NOTE: this has no effect if you provide a custom template.
      **kwargs (Any): Deprecated arguments for backward compatibility.
+
+    #### Shuffling
+
+    You can shuffle choices when you load your dataset by using the `shuffle_choices` method or parameter of the datasets API.
     """
     shuffle: bool | Random = False
     if "shuffle" in kwargs:
inspect_ai/solver/_prompt.py
@@ -20,8 +20,8 @@ def prompt_template(template: str, **params: Any) -> Solver:
     `params`.
 
     Args:
-      template: (str): Template for prompt.
-      **params (dict[str,Any]): Parameters to fill into the template.
+      template: Template for prompt.
+      **params: Parameters to fill into the template.
 
     Returns:
       A solver that uses the specified prompt template.
@@ -51,8 +51,8 @@ def system_message(template: str, **params: Any) -> Solver:
     are none it will be inserted at the beginning of the conversation).
 
     Args:
-      template (str): Template for system message.
-      **params (dict[str,Any]): Parameters to fill into the template.
+      template: Template for system message.
+      **params: Parameters to fill into the template.
 
     Returns:
       A solver that inserts the parameterised system message.
@@ -80,8 +80,8 @@ def user_message(template: str, **params: Any) -> Solver:
     included in the `params`.
 
     Args:
-      template (str): Template for user message.
-      **params (dict[str,Any]): Parameters to fill into the template.
+      template: Template for user message.
+      **params: Parameters to fill into the template.
 
     Returns:
       A solver that inserts the parameterised user message.
@@ -109,7 +109,7 @@ def chain_of_thought(template: str = DEFAULT_COT_TEMPLATE) -> Solver:
     """Solver which modifies the user prompt to encourage chain of thought.
 
     Args:
-      template (str): String or path to file containing CoT template.
+      template: String or path to file containing CoT template.
        The template uses a single variable: `prompt`.
     """
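
A hedged sketch combining the templating solvers documented above into a simple chain; the template text is illustrative, and both templates substitute the documented variables (`{prompt}` plus any `params`):

```python
from inspect_ai.solver import chain, generate, prompt_template, system_message

cot_pipeline = chain(
    system_message("You are a meticulous {role}.", role="math tutor"),
    prompt_template("{prompt}\n\nShow your reasoning before the final answer."),
    generate(),
)
```
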