PyPI - inspect-ai - Versions diffs - 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl - Mend

inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (219) hide show

inspect_ai/_cli/eval.py CHANGED Viewed

@@ -390,15 +390,19 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--reasoning-effort",
         type=click.Choice(["low", "medium", "high"]),
-        help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
+        help="Constrains effort on reasoning for reasoning models. Open AI o-series models only.",
         envvar="INSPECT_EVAL_REASONING_EFFORT",
     )
     @click.option(
-        "--reasoning-history/--no-reasoning-history",
-        type=bool,
-        is_flag=True,
-        default=True,
-        help="Include reasoning in chat message history sent to generate.",
+        "--reasoning-tokens",
+        type=int,
+        help="Maximum number of tokens to use for reasoning. Anthropic Claude models only.",
+        envvar="INSPECT_EVAL_REASONING_TOKENS",
+    )
+    @click.option(
+        "--reasoning-history",
+        type=click.Choice(["none", "all", "last", "auto"]),
+        help='Include reasoning in chat message history sent to generate (defaults to "auto", which uses the recommended default for each provider)',
         envvar="INSPECT_EVAL_REASONING_HISTORY",
     )
     @click.option(
@@ -470,7 +474,8 @@ def eval_command(
     max_tool_output: int | None,
     cache_prompt: str | None,
     reasoning_effort: str | None,
-    reasoning_history: bool | None,
+    reasoning_tokens: int | None,
+    reasoning_history: Literal["none", "all", "last", "auto"] | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
@@ -633,7 +638,8 @@ def eval_set_command(
     max_tool_output: int | None,
     cache_prompt: str | None,
     reasoning_effort: str | None,
-    reasoning_history: bool | None,
+    reasoning_tokens: int | None,
+    reasoning_history: Literal["none", "all", "last", "auto"] | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,

inspect_ai/_display/core/display.py CHANGED Viewed

@@ -143,3 +143,5 @@ class Display(Protocol):
     @contextlib.contextmanager
     def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]: ...
+    def display_counter(self, caption: str, value: str) -> None: ...

inspect_ai/_display/core/footer.py CHANGED Viewed

@@ -9,10 +9,12 @@ from .config import task_dict
 @throttle(1)
-def task_footer(style: str = "") -> tuple[RenderableType, RenderableType]:
+def task_footer(
+    counters: dict[str, str], style: str = ""
+) -> tuple[RenderableType, RenderableType]:
     return (
         Text.from_markup(task_resources(), style=style),
-        Text.from_markup(task_http_rate_limits(), style=style),
+        Text.from_markup(task_counters(counters), style=style),
     )
@@ -23,5 +25,13 @@ def task_resources() -> str:
     return task_dict(resources)
-def task_http_rate_limits() -> str:
+def task_counters(counters: dict[str, str]) -> str:
+    return task_dict(counters | task_http_rate_limits())
+def task_http_rate_limits() -> dict[str, str]:
+    return {"HTTP rate limits": f"{http_rate_limit_count():,}"}
+def task_http_rate_limits_str() -> str:
     return f"HTTP rate limits: {http_rate_limit_count():,}"

inspect_ai/_display/plain/display.py CHANGED Viewed

@@ -22,7 +22,7 @@ from ..core.display import (
     TaskSpec,
     TaskWithResult,
 )
-from ..core.footer import task_http_rate_limits
+from ..core.footer import task_http_rate_limits_str
 from ..core.panel import task_panel, task_targets
 from ..core.results import task_metric, tasks_results
@@ -89,6 +89,10 @@ class PlainDisplay(Display):
             show_model_names=self.multiple_model_names,
         )
+    def display_counter(self, caption: str, value: str) -> None:
+        # Not supported for plain display as counters are only shown for tasks.
+        pass
     def _print_results(self) -> None:
         """Print final results using rich panels"""
         panels = tasks_results(self.tasks)
@@ -178,7 +182,7 @@ class PlainTaskDisplay(TaskDisplay):
             status_parts.append(resources)
             # Add rate limits
-            rate_limits = task_http_rate_limits()
+            rate_limits = task_http_rate_limits_str()
             if rate_limits:
                 status_parts.append(rate_limits)

inspect_ai/_display/rich/display.py CHANGED Viewed

@@ -60,6 +60,7 @@ class RichDisplay(Display):
         self.parallel = False
         self.live: Live | None = None
         self.timer_handle: asyncio.TimerHandle | None = None
+        self.counters: dict[str, str] = {}
         rich_initialise()
     @override
@@ -153,13 +154,20 @@ class RichDisplay(Display):
             and self.live.is_started
         ):
             if self.parallel:
-                r = tasks_live_status(self.total_tasks, self.tasks, self.progress_ui)
+                r = tasks_live_status(
+                    self.total_tasks, self.tasks, self.progress_ui, self.counters
+                )
             else:
-                r = task_live_status(self.tasks, self.progress_ui)
+                r = task_live_status(self.tasks, self.progress_ui, self.counters)
             self.live.update(r, refresh=True)
         self.timer_handle = asyncio.get_event_loop().call_later(1, self._update_display)
+    @override
+    def display_counter(self, caption: str, value: str) -> None:
+        self.counters[caption] = value
+        self._update_display()
 class RichTaskScreen(TaskScreen):
     def __init__(self, live: Live) -> None:
@@ -286,7 +294,9 @@ class RichTaskDisplay(TaskDisplay):
         self.p.complete()
-def task_live_status(tasks: list[TaskStatus], progress: RProgress) -> RenderableType:
+def task_live_status(
+    tasks: list[TaskStatus], progress: RProgress, counters: dict[str, str]
+) -> RenderableType:
     theme = rich_theme()
     # the panel contents
@@ -300,13 +310,16 @@ def task_live_status(tasks: list[TaskStatus], progress: RProgress) -> Renderable
         show_model=len(tasks) == 1,
         body=Group("", progress),
         subtitle=subtitle,
-        footer=task_footer(theme.light),
+        footer=task_footer(counters, theme.light),
         log_location=None,
     )
 def tasks_live_status(
-    total_tasks: int, tasks: list[TaskStatus], progress: RProgress
+    total_tasks: int,
+    tasks: list[TaskStatus],
+    progress: RProgress,
+    counters: dict[str, str],
 ) -> RenderableType:
     # rendering context
     theme = rich_theme()
@@ -325,7 +338,7 @@ def tasks_live_status(
     footer_table = Table.grid(expand=True)
     footer_table.add_column()
     footer_table.add_column(justify="right")
-    footer = task_footer(theme.light)
+    footer = task_footer(counters, theme.light)
     footer_table.add_row()
     footer_table.add_row(footer[0], footer[1])

inspect_ai/_display/textual/app.py CHANGED Viewed

@@ -89,6 +89,7 @@ class TaskScreenApp(App[TR]):
         self._total_tasks = 0
         self._parallel = False
         self._tasks: list[TaskWithResult] = []
+        self._counters: dict[str, str] = {}
         # all tasks processed by app
         self._app_tasks: list[TaskWithResult] = []
@@ -302,7 +303,7 @@ class TaskScreenApp(App[TR]):
         samples_view.set_samples(active_and_started_samples)
     def update_footer(self) -> None:
-        left, right = task_footer()
+        left, right = task_footer(self._counters)
         footer = self.query_one(AppFooter)
         footer.left = left
         footer.right = right
@@ -377,6 +378,10 @@ class TaskScreenApp(App[TR]):
         except NoMatches:
             return None
+    def display_counter(self, caption: str, value: str) -> None:
+        self._counters[caption] = value
+        self.update_footer()
     class InputPanelHost(InputPanel.Host):
         def __init__(self, app: "TaskScreenApp[TR]", tab_id: str) -> None:
             self.app = app

inspect_ai/_display/textual/display.py CHANGED Viewed

@@ -72,3 +72,7 @@ class TextualDisplay(Display):
     def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]:
         with self.app.task_display(profile) as task_display:
             yield task_display
+    @override
+    def display_counter(self, caption: str, value: str) -> None:
+        self.app.display_counter(caption, value)

inspect_ai/_display/textual/widgets/transcript.py CHANGED Viewed

@@ -9,7 +9,7 @@ from textual.containers import ScrollableContainer
 from textual.widget import Widget
 from textual.widgets import Static
-from inspect_ai._util.content import ContentText
+from inspect_ai._util.content import ContentReasoning, ContentText
 from inspect_ai._util.rich import lines_display
 from inspect_ai._util.transcript import (
     set_transcript_markdown_options,
@@ -36,7 +36,6 @@ from inspect_ai.log._transcript import (
 )
 from inspect_ai.model._chat_message import (
     ChatMessage,
-    ChatMessageAssistant,
     ChatMessageUser,
 )
 from inspect_ai.model._render import messages_preceding_assistant
@@ -333,11 +332,16 @@ def render_message(message: ChatMessage) -> list[RenderableType]:
         Text(),
     ]
-    if isinstance(message, ChatMessageAssistant) and message.reasoning:
-        content.extend(transcript_reasoning(message.reasoning))
-    if message.text:
+    # deal with plain text or with content blocks
+    if isinstance(message.content, str):
         content.extend([transcript_markdown(message.text.strip(), escape=True)])
+    else:
+        for c in message.content:
+            if isinstance(c, ContentReasoning):
+                content.extend(transcript_reasoning(c))
+            elif isinstance(c, ContentText):
+                content.extend([transcript_markdown(c.text.strip(), escape=True)])
     return content

inspect_ai/_eval/task/run.py CHANGED Viewed

@@ -50,11 +50,7 @@ from inspect_ai.log import (
 from inspect_ai.log._condense import condense_sample
 from inspect_ai.log._file import eval_log_json_str
 from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
-from inspect_ai.log._samples import (
-    active_sample,
-    set_active_sample_message_limit,
-    set_active_sample_token_limit,
-)
+from inspect_ai.log._samples import active_sample
 from inspect_ai.log._transcript import (
     ErrorEvent,
     SampleInitEvent,
@@ -695,9 +691,10 @@ async def task_run_sample(
                     assert time_limit
                     timeout_cm = timeout(time_limit / 2)
-                # turn off sample limits
-                set_active_sample_token_limit(None)
-                set_active_sample_message_limit(None)
+                # turn off message and token limits
+                state.message_limit = None
+                state.token_limit = None
+                set_sample_state(state)
                 # scoring
                 try:

inspect_ai/_util/content.py CHANGED Viewed

@@ -13,6 +13,25 @@ class ContentText(BaseModel):
     """Text content."""
+class ContentReasoning(BaseModel):
+    """Reasoning content.
+    See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
+    """
+    type: Literal["reasoning"] = Field(default="reasoning")
+    """Type."""
+    reasoning: str
+    """Reasoning content."""
+    signature: str | None = Field(default=None)
+    """Signature for reasoning content (used by some models to ensure that reasoning content is not modified for replay)"""
+    redacted: bool = Field(default=False)
+    """Indicates that the explicit content of this reasoning block has been redacted."""
 class ContentImage(BaseModel):
     """Image content."""
@@ -55,5 +74,5 @@ class ContentVideo(BaseModel):
     """Format of video data ('mp4', 'mpeg', or 'mov')"""
-Content = Union[ContentText, ContentImage, ContentAudio, ContentVideo]
+Content = Union[ContentText, ContentReasoning, ContentImage, ContentAudio, ContentVideo]
 """Content sent to or received from a model."""

inspect_ai/_util/transcript.py CHANGED Viewed

@@ -10,6 +10,8 @@ from rich.panel import Panel
 from rich.rule import Rule
 from rich.text import Text
+from inspect_ai._util.content import ContentReasoning
 from .format import format_function_call
@@ -111,12 +113,16 @@ def transcript_panel(
     )
-def transcript_reasoning(reasoning: str) -> list[RenderableType]:
+def transcript_reasoning(reasoning: ContentReasoning) -> list[RenderableType]:
     content: list[RenderableType] = []
+    text = (
+        reasoning.reasoning
+        if not reasoning.redacted
+        else "Reasoning encrypted by model provider."
+    )
     content.append(
-        transcript_markdown(
-            f"**<think>**  \n{reasoning}  \n**</think>**\n\n", escape=True
-        )
+        transcript_markdown(f"**<think>**  \n{text}  \n**</think>**\n\n", escape=True)
     )
     content.append(Text())
     return content

inspect_ai/_util/working.py CHANGED Viewed

@@ -12,6 +12,10 @@ def sample_waiting_time() -> float:
     return _sample_waiting_time.get()
+def sample_working_time() -> float:
+    return time.monotonic() - _sample_start_time.get() - sample_waiting_time()
 def report_sample_waiting_time(waiting_time: float) -> None:
     _sample_waiting_time.set(_sample_waiting_time.get() + waiting_time)
     check_sample_working_limit()

inspect_ai/_view/www/App.css CHANGED Viewed

@@ -805,15 +805,21 @@ table.table.table-sm td {
   overflow: unset;
 }
+.markdown-content pre[class*="language-"],
 pre[class*="language-"].tool-output,
 .tool-output {
   background-color: #f8f8f8;
 }
+.vscode-dark .model-call pre[class*="language-"],
+.vscode-dark .markdown-content pre[class*="language-"],
 .vscode-dark pre[class*="language-"].tool-output,
 .vscode-dark .tool-output {
   background-color: #333333;
 }
+.model-call pre[class*="language-"],
+.markdown-content pre[class*="language-"],
 pre[class*="language-"].tool-output {
   border: none !important;
   box-shadow: none !important;

inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl

inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl