inspect-ai 0.3.60__py3-none-any.whl → 0.3.62__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +13 -1
- inspect_ai/_cli/view.py +4 -0
- inspect_ai/_display/textual/widgets/transcript.py +15 -9
- inspect_ai/_eval/task/error.py +10 -14
- inspect_ai/_eval/task/generate.py +41 -35
- inspect_ai/_eval/task/run.py +20 -12
- inspect_ai/_util/hooks.py +17 -7
- inspect_ai/_util/transcript.py +11 -0
- inspect_ai/_view/www/dist/assets/index.css +1 -0
- inspect_ai/_view/www/dist/assets/index.js +100 -94
- inspect_ai/_view/www/log-schema.json +35 -19
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/components/ChatView.mjs +23 -0
- inspect_ai/_view/www/src/types/log.d.ts +6 -4
- inspect_ai/log/_recorders/eval.py +1 -1
- inspect_ai/model/_chat_message.py +29 -2
- inspect_ai/model/_conversation.py +10 -3
- inspect_ai/model/_generate_config.py +6 -0
- inspect_ai/model/_model.py +164 -25
- inspect_ai/model/_openai.py +33 -1
- inspect_ai/model/_providers/anthropic.py +12 -3
- inspect_ai/model/_providers/groq.py +4 -0
- inspect_ai/model/_providers/openai.py +21 -9
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_reasoning.py +17 -0
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_basic_agent.py +78 -58
- inspect_ai/{util → solver}/_limit.py +13 -0
- inspect_ai/solver/_task_state.py +37 -7
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -1
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +5 -3
- inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +1 -1
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/settings.json +3 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +61 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +10 -0
- inspect_ai/util/__init__.py +0 -2
- inspect_ai/util/_sandbox/self_check.py +51 -28
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/RECORD +45 -40
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +0 -10
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -385,6 +385,14 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
         envvar="INSPECT_EVAL_REASONING_EFFORT",
     )
+    @click.option(
+        "--reasoning-history/--no-reasoning-history",
+        type=bool,
+        is_flag=True,
+        default=True,
+        help="Include reasoning in chat message history sent to generate.",
+        envvar="INSPECT_EVAL_REASONING_HISTORY",
+    )
     @click.option(
         "--log-format",
         type=click.Choice(["eval", "json"], case_sensitive=False),
@@ -444,6 +452,7 @@ def eval_command(
     max_tool_output: int | None,
     cache_prompt: str | None,
     reasoning_effort: str | None,
+    reasoning_history: bool | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
@@ -592,7 +601,6 @@ def eval_set_command(
     logit_bias: str | None,
     seed: int | None,
     stop_seqs: str | None,
-    suffix: str | None,
     temperature: float | None,
     top_p: float | None,
     top_k: int | None,
@@ -604,6 +612,7 @@ def eval_set_command(
    max_tool_output: int | None,
    cache_prompt: str | None,
    reasoning_effort: str | None,
+   reasoning_history: bool | None,
    message_limit: int | None,
    token_limit: int | None,
    time_limit: int | None,
@@ -842,6 +851,9 @@ def config_from_locals(locals: dict[str, Any]) -> GenerateConfigArgs:
         if key == "internal_tools":
             if value is not False:
                 value = None
+        if key == "reasoning_history":
+            if value is not False:
+                value = None
         config[key] = value  # type: ignore
     return config

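The new `--reasoning-history/--no-reasoning-history` flag maps onto a `reasoning_history` generation option (see `_generate_config.py` in the file list above). A minimal sketch of setting it from the Python API, assuming the field is exposed on `GenerateConfig` as the CLI mapping suggests:

```python
# Sketch only: reasoning_history is assumed to be a GenerateConfig field in
# 0.3.62, mirroring the --no-reasoning-history CLI flag added above.
from inspect_ai.model import GenerateConfig

config = GenerateConfig(
    reasoning_effort="high",    # existing option (OpenAI o1 models)
    reasoning_history=False,    # new: omit reasoning from subsequent generate calls
)
```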
inspect_ai/_cli/view.py
CHANGED
@@ -63,6 +63,10 @@ def start(
     INSPECT_VIEW_AUTHORIZATION_TOKEN = "INSPECT_VIEW_AUTHORIZATION_TOKEN"
     authorization = os.environ.get(INSPECT_VIEW_AUTHORIZATION_TOKEN, None)
     if authorization:
+        # this indicates we are in vscode -- we want to set the log level to HTTP
+        # in vscode, updated versions of the extension do this but we set it
+        # manually here as a temporary bridge for running against older versions
+        common["log_level"] = "HTTP"
         del os.environ[INSPECT_VIEW_AUTHORIZATION_TOKEN]
         os.unsetenv(INSPECT_VIEW_AUTHORIZATION_TOKEN)

inspect_ai/_display/textual/widgets/transcript.py
CHANGED
@@ -15,6 +15,7 @@ from inspect_ai._util.transcript import (
     set_transcript_markdown_options,
     transcript_function,
     transcript_markdown,
+    transcript_reasoning,
     transcript_separator,
 )
 from inspect_ai.log._samples import ActiveSample
@@ -33,7 +34,11 @@ from inspect_ai.log._transcript import (
     SubtaskEvent,
     ToolEvent,
 )
-from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
+from inspect_ai.model._chat_message import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ChatMessageUser,
+)
 from inspect_ai.model._render import messages_preceding_assistant
 from inspect_ai.tool._tool import ToolResult
 from inspect_ai.tool._tool_transcript import transcript_tool_call
@@ -171,8 +176,8 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
     # content
     content: list[RenderableType] = []

-    def append_message(message: ChatMessage, text: str | None = None) -> None:
-        content.extend(render_message(message, text))
+    def append_message(message: ChatMessage) -> None:
+        content.extend(render_message(message))

     # render preceding messages
     preceding = messages_preceding_assistant(event.input)
@@ -309,16 +314,17 @@ def render_as_json(json: Any) -> RenderableType:
     )


-def render_message(
-    message: ChatMessage, text: str | None = None
-) -> list[RenderableType]:
+def render_message(message: ChatMessage) -> list[RenderableType]:
     content: list[RenderableType] = [
         Text(message.role.capitalize(), style="bold"),
         Text(),
     ]
-    text = text or message.text
-    if text:
-        content.extend([transcript_markdown(text.strip(), escape=True)])
+
+    if isinstance(message, ChatMessageAssistant) and message.reasoning:
+        content.extend(transcript_reasoning(message.reasoning))
+
+    if message.text:
+        content.extend([transcript_markdown(message.text.strip(), escape=True)])
     return content

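The widget changes assume assistant messages can now carry a `reasoning` field (`ChatMessageAssistant` gains it in `_chat_message.py` per the file list). A small illustrative sketch of the kind of message `render_message` now handles; the field name follows this diff and is not a stability guarantee:

```python
# Illustrative only: a ChatMessageAssistant carrying reasoning, which
# render_message now renders (via transcript_reasoning) ahead of the text.
from inspect_ai.model import ChatMessageAssistant

message = ChatMessageAssistant(
    content="The answer is 42.",
    reasoning="Recall the question about life, the universe, and everything...",
)
```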
inspect_ai/_eval/task/error.py
CHANGED
@@ -8,28 +8,24 @@ class SampleErrorHandler:
         self.fail_on_error = True if fail_on_error is None else fail_on_error
         self.total_samples = float(total_samples)

-    def __call__(self, ex: BaseException) -> EvalError:
+    def __call__(self, ex: BaseException) -> tuple[EvalError, BaseException | None]:
         # increment error count
         self.error_count += 1

         # create error (we may return it)
-        def sample_error() -> EvalError:
-            return eval_error(ex, type(ex), ex, ex.__traceback__)
+        def sample_error(
+            *, raise_error: bool
+        ) -> tuple[EvalError, BaseException | None]:
+            return eval_error(
+                ex, type(ex), ex, ex.__traceback__
+            ), ex if raise_error else None

         # check against limits
         if isinstance(self.fail_on_error, bool):
-            if self.fail_on_error:
-                raise ex
-            else:
-                return sample_error()
+            return sample_error(raise_error=self.fail_on_error)
         else:
             if self.fail_on_error < 1:
                 max_errors = self.fail_on_error * self.total_samples
-                if self.error_count >= max_errors:
-                    raise ex
-                else:
-                    return sample_error()
-            elif self.error_count >= self.fail_on_error:
-                raise ex
+                return sample_error(raise_error=self.error_count >= max_errors)
             else:
-                return sample_error()
+                return sample_error(raise_error=self.error_count >= self.fail_on_error)
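`SampleErrorHandler` now returns the error together with an optional exception to re-raise, instead of raising immediately. A toy sketch of the new contract (the handler below is hypothetical; `EvalError` fields assumed from `inspect_ai.log`):

```python
# Hypothetical sketch of the (EvalError, exception-or-None) contract; callers
# defer the raise, as task_run_sample now does.
from inspect_ai.log import EvalError

def toy_handler(ex: BaseException, fail: bool) -> tuple[EvalError, BaseException | None]:
    error = EvalError(message=str(ex), traceback="", traceback_ansi="")
    return error, (ex if fail else None)

error, raise_error = toy_handler(ValueError("sample failed"), fail=False)
if raise_error is not None:
    raise raise_error  # only raised once the sample has been completed/logged
```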
inspect_ai/_eval/task/generate.py
CHANGED
@@ -8,6 +8,7 @@ from inspect_ai.model import (
 )
 from inspect_ai.model._cache import epoch
 from inspect_ai.solver import TaskState
+from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.tool import ToolFunction


@@ -21,45 +22,50 @@ async def task_generate(
     # track tool_choice (revert to "auto" after first forced call of a tool)
     tool_choice = state.tool_choice

-    while True:
-        # If we don't update the epoch here as we go, it's entirely possible
-        # we'd cache the same response for every single epoch, which would
-        # completely defeat the point!
-        epoch.set(state.epoch)
+    try:
+        while True:
+            # If we don't update the epoch here as we go, it's entirely possible
+            # we'd cache the same response for every single epoch, which would
+            # completely defeat the point!
+            epoch.set(state.epoch)

-        # call the model
-        state.output = await model.generate(
-            input=state.messages,
-            tools=state.tools,
-            tool_choice=tool_choice,
-            config=config,
-            cache=cache,
-        )
+            # call the model
+            state.output = await model.generate(
+                input=state.messages,
+                tools=state.tools,
+                tool_choice=tool_choice,
+                config=config,
+                cache=cache,
+            )

-        # append the assistant message
-        message = state.output.message
-        state.messages.append(message)
+            # append the assistant message
+            message = state.output.message
+            state.messages.append(message)

-        # check for completed
-        if state.completed:
-            return state
+            # check for completed
+            if state.completed:
+                return state

-        # resolve tool calls if necessary
-        if tool_calls != "none" and message.tool_calls:
-            # call tools and append messages to state
-            state.messages.extend(
-                await call_tools(message, state.tools, config.max_tool_output)
-            )
+            # resolve tool calls if necessary
+            if tool_calls != "none" and message.tool_calls:
+                # call tools and append messages to state
+                state.messages.extend(
+                    await call_tools(message, state.tools, config.max_tool_output)
+                )

-            # check for completed or only executing a single tool call
-            if state.completed or tool_calls == "single":
-                return state
+                # check for completed or only executing a single tool call
+                if state.completed or tool_calls == "single":
+                    return state
+
+                # if a tool_call was forced set tool_choice to 'auto'
+                # (otherwise it will get forced over and over again)
+                if isinstance(tool_choice, ToolFunction):
+                    tool_choice = "auto"

-            # if a tool_call was forced set tool_choice to 'auto'
-            # (otherwise it will get forced over and over again)
-            if isinstance(tool_choice, ToolFunction):
-                tool_choice = "auto"
+            # no tool calls or not resolving tool calls, we are done!
+            else:
+                return state

-        # no tool calls or not resolving tool calls, we are done!
-        else:
-            return state
+    # propagate current state along with sample limit exceeded
+    except SampleLimitExceededError as ex:
+        raise ex.with_state(state)
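With the limit error now carrying the `TaskState` reached so far, callers can recover the partial conversation. A hedged sketch of consuming that (import path and `state` attribute per this diff; the wrapper itself is hypothetical):

```python
# Sketch: recover the partial TaskState when a sample limit interrupts generate.
from inspect_ai.solver import Generate, TaskState
from inspect_ai.solver._limit import SampleLimitExceededError

async def generate_with_partial_state(state: TaskState, generate: Generate) -> TaskState:
    try:
        return await generate(state)
    except SampleLimitExceededError as ex:
        # ex.state is attached by task_generate via with_state()
        return ex.state or state
```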
inspect_ai/_eval/task/run.py
CHANGED
@@ -75,9 +75,9 @@ from inspect_ai.scorer._scorer import unique_scorer_name
 from inspect_ai.solver import Generate, Plan, TaskState
 from inspect_ai.solver._chain import Chain, unroll
 from inspect_ai.solver._fork import set_task_generate
+from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
-from inspect_ai.util._limit import SampleLimitExceededError
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._subtask import init_subtask
@@ -402,7 +402,13 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
             view_notify_eval(logger.location)

         try:
-            await send_telemetry("eval_log", eval_log_json_str(eval_log))
+            if (
+                await send_telemetry("eval_log_location", eval_log.location)
+                == "not_handled"
+            ):
+                # Converting the eval log to JSON is expensive. Only do so if
+                # eval_log_location was not handled.
+                await send_telemetry("eval_log", eval_log_json_str(eval_log))
         except Exception as ex:
             py_logger.warning(
                 f"Error occurred sending telemetry: {exception_message(ex)}"
@@ -490,7 +496,7 @@ async def task_run_sample(
     logger: TaskLogger | None,
     log_images: bool,
     sample_source: EvalSampleSource | None,
-    sample_error: Callable[[BaseException], EvalError],
+    sample_error: SampleErrorHandler,
     sample_complete: Callable[[dict[str, SampleScore]], None],
     fails_on_error: bool,
     time_limit: int | None,
@@ -542,12 +548,12 @@ async def task_run_sample(
         )

     # helper to handle exceptions (will throw if we've exceeded the limit)
-    def handle_error(ex: BaseException) -> EvalError:
+    def handle_error(ex: BaseException) -> tuple[EvalError, BaseException | None]:
         err = sample_error(ex)
         py_logger.warning(
             f"Sample error (id: {sample.id}, epoch: {state.epoch}): {exception_message(ex)})"
         )
-        transcript()._event(ErrorEvent(error=err))
+        transcript()._event(ErrorEvent(error=err[0]))
         return err

     # solver loop
@@ -566,6 +572,7 @@ async def task_run_sample(
         ) as active,
     ):
         error: EvalError | None = None
+        raise_error: BaseException | None = None
         results: dict[str, SampleScore] = {}
         try:
             async with sandboxenv_cm:
@@ -634,7 +641,7 @@ async def task_run_sample(
                                 state = sample_state() or state
                             case "error":
                                 # default error handling
-                                error = handle_error(ex)
+                                error, raise_error = handle_error(ex)

                     else:
                         raise
@@ -650,11 +657,11 @@ async def task_run_sample(
                     )

                 # capture most recent state for scoring
-                state = sample_state() or state
+                state = ex.state or sample_state() or state
                 state.completed = True

             except BaseException as ex:
-                error = handle_error(ex)
+                error, raise_error = handle_error(ex)

             # set timeout for scoring. if the original timeout was hit we still
             # want to provide opportunity for scoring, but we don't necessarily
@@ -731,11 +738,10 @@ async def task_run_sample(
                 )

                 # handle error (this will throw if we've exceeded the limit)
-                error = handle_error(ex)
+                error, raise_error = handle_error(ex)

-            except Exception as ex:
-                # handle error (this will throw if we've exceeded the limit)
-                error = handle_error(ex)
+            except Exception as ex:
+                error, raise_error = handle_error(ex)

             # complete the sample
             progress(SAMPLE_TOTAL_PROGRESS_UNITS)
@@ -766,6 +772,8 @@ async def task_run_sample(
         if results is not None:
             sample_complete(results)
             return results
+        elif raise_error:
+            raise raise_error
         else:
             return None

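The telemetry change prefers the cheap `eval_log_location` event and only serializes the full log when nothing handled it. A small sketch of that fallback using the updated `send_telemetry` return values (private module path per this diff; the wrapper and `to_json` callback are hypothetical stand-ins for `eval_log_json_str`):

```python
# Sketch of the location-first fallback; to_json stands in for the expensive
# full-log serialization used in task_run above.
from typing import Callable

from inspect_ai._util.hooks import send_telemetry

async def notify_eval_complete(log_location: str, to_json: Callable[[], str]) -> None:
    if await send_telemetry("eval_log_location", log_location) == "not_handled":
        # only pay the JSON serialization cost when a subscriber needs it
        await send_telemetry("eval_log", to_json())
```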
inspect_ai/_util/hooks.py
CHANGED
@@ -17,19 +17,29 @@ from .error import PrerequisiteError
 #
 # Telemetry can be optionally enabled by setting an INSPECT_TELEMETRY
 # environment variable that points to a function in a package which
-# conforms to the TelemetrySend signature below.
+# conforms to the TelemetrySend signature below. A return value of True
+# indicates that the telemetry event was handled.

-# There are currently two types of telemetry sent:
-# - model_usage (JSON string of the model usage)
-# - eval_log (JSON string of the eval log)
+# There are currently three types of telemetry sent:
+# - model_usage (JSON string of the model usage)
+# - eval_log_location (file path or URL string of the eval log)
+# - eval_log (JSON string of the eval log)
+#   [only sent if eval_log_location unhandled]
+# The eval_log_location type is preferred over eval_log as it means we can take
+# advantage of the .eval format and avoid loading the whole log into memory.

-TelemetrySend = Callable[[str, str], Awaitable[None]]
+TelemetrySend = Callable[[str, str], Awaitable[bool]]


-async def send_telemetry(type: Literal["model_usage", "eval_log"], json: str) -> None:
+async def send_telemetry(
+    type: Literal["model_usage", "eval_log", "eval_log_location"], json: str
+) -> Literal["handled", "not_handled", "no_subscribers"]:
     global _send_telemetry
     if _send_telemetry:
-        await _send_telemetry(type, json)
+        if await _send_telemetry(type, json):
+            return "handled"
+        return "not_handled"
+    return "no_subscribers"


 _send_telemetry: TelemetrySend | None = None
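Under the updated `TelemetrySend` signature, a subscriber signals that it handled an event by returning `True`. A minimal sketch of such a handler (the wiring via the `INSPECT_TELEMETRY` environment variable is per the comment above; the body is illustrative only):

```python
# Illustrative subscriber: handle the lightweight eval_log_location event and
# decline everything else so inspect can fall back (e.g. to sending eval_log).
handled_locations: list[str] = []

async def send_telemetry(type: str, json: str) -> bool:
    if type == "eval_log_location":
        handled_locations.append(json)
        return True
    return False
```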
inspect_ai/_util/transcript.py
CHANGED
@@ -111,6 +111,17 @@ def transcript_panel(
     )


+def transcript_reasoning(reasoning: str) -> list[RenderableType]:
+    content: list[RenderableType] = []
+    content.append(
+        transcript_markdown(
+            f"**<think>** \n{reasoning} \n**</think>**\n\n", escape=True
+        )
+    )
+    content.append(Text())
+    return content
+
+
 def transcript_separator(title: str, color: str) -> RenderableType:
     return Rule(title=title, style=f"{color} bold", align="center", end="\n\n")

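A short usage sketch of the new helper: it returns rich renderables that wrap the reasoning in `<think>` markers and can be printed directly with a rich `Console` (private import path per this diff):

```python
from rich.console import Console

from inspect_ai._util.transcript import transcript_reasoning

console = Console()
for renderable in transcript_reasoning("First consider the constraints, then..."):
    console.print(renderable)
```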
inspect_ai/_view/www/dist/assets/index.css
CHANGED
@@ -15735,6 +15735,7 @@ pre.ap-terminal.ap-cursor-on .ap-line .ap-cursor.ap-inverse {
 }
 pre.ap-terminal:not(.ap-blink) .ap-line .ap-blink {
   color: transparent;
+  border-color: transparent;
 }
 pre.ap-terminal .ap-bright {
   font-weight: bold;