inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +14 -8
- inspect_ai/_display/core/display.py +2 -0
- inspect_ai/_display/core/footer.py +13 -3
- inspect_ai/_display/plain/display.py +6 -2
- inspect_ai/_display/rich/display.py +19 -6
- inspect_ai/_display/textual/app.py +6 -1
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/transcript.py +10 -6
- inspect_ai/_eval/task/run.py +5 -8
- inspect_ai/_util/content.py +20 -1
- inspect_ai/_util/transcript.py +10 -4
- inspect_ai/_util/working.py +4 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +115 -87
- inspect_ai/_view/www/dist/assets/index.js +5324 -2276
- inspect_ai/_view/www/eslint.config.mjs +24 -1
- inspect_ai/_view/www/log-schema.json +283 -20
- inspect_ai/_view/www/package.json +8 -3
- inspect_ai/_view/www/src/App.tsx +2 -2
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
- inspect_ai/_view/www/src/components/Card.tsx +9 -8
- inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
- inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
- inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
- inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
- inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
- inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
- inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
- inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
- inspect_ai/_view/www/src/index.tsx +2 -2
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
- inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
- inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
- inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +8 -7
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +34 -15
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
- inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
- inspect_ai/_view/www/src/types/log.d.ts +129 -34
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
- inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
- inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
- inspect_ai/_view/www/src/utils/format.ts +1 -1
- inspect_ai/_view/www/src/utils/json.ts +24 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -2
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
- inspect_ai/_view/www/yarn.lock +241 -5
- inspect_ai/log/_condense.py +3 -0
- inspect_ai/log/_recorders/eval.py +6 -1
- inspect_ai/log/_transcript.py +58 -1
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +7 -0
- inspect_ai/model/_chat_message.py +22 -7
- inspect_ai/model/_conversation.py +10 -8
- inspect_ai/model/_generate_config.py +25 -4
- inspect_ai/model/_model.py +133 -57
- inspect_ai/model/_model_output.py +3 -0
- inspect_ai/model/_openai.py +106 -40
- inspect_ai/model/_providers/anthropic.py +281 -153
- inspect_ai/model/_providers/google.py +27 -8
- inspect_ai/model/_providers/groq.py +9 -4
- inspect_ai/model/_providers/openai.py +57 -4
- inspect_ai/model/_providers/openai_o1.py +10 -0
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_reasoning.py +15 -2
- inspect_ai/scorer/_model.py +23 -19
- inspect_ai/solver/_human_agent/agent.py +14 -10
- inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
- inspect_ai/solver/_human_agent/commands/submit.py +76 -30
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +3 -1
- inspect_ai/tool/_tools/_computer/_common.py +117 -58
- inspect_ai/tool/_tools/_computer/_computer.py +80 -57
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +7 -1
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +91 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +8 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +12 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +78 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +20 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +175 -113
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +76 -20
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +65 -0
- inspect_ai/tool/_tools/_computer/test_args.py +151 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
- inspect_ai/util/__init__.py +2 -1
- inspect_ai/util/_display.py +12 -0
- inspect_ai/util/_sandbox/events.py +55 -21
- inspect_ai/util/_sandbox/self_check.py +131 -43
- inspect_ai/util/_subtask.py +11 -0
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/RECORD +209 -186
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/WHEEL +1 -1
- inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
- inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
- inspect_ai/tool/_tools/_computer/_computer_split.py +0 -198
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
- inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -390,15 +390,19 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
390
390
|
@click.option(
|
391
391
|
"--reasoning-effort",
|
392
392
|
type=click.Choice(["low", "medium", "high"]),
|
393
|
-
help="Constrains effort on reasoning for reasoning models. Open AI
|
393
|
+
help="Constrains effort on reasoning for reasoning models. Open AI o-series models only.",
|
394
394
|
envvar="INSPECT_EVAL_REASONING_EFFORT",
|
395
395
|
)
|
396
396
|
@click.option(
|
397
|
-
"--reasoning-
|
398
|
-
type=
|
399
|
-
|
400
|
-
|
401
|
-
|
397
|
+
"--reasoning-tokens",
|
398
|
+
type=int,
|
399
|
+
help="Maximum number of tokens to use for reasoning. Anthropic Claude models only.",
|
400
|
+
envvar="INSPECT_EVAL_REASONING_TOKENS",
|
401
|
+
)
|
402
|
+
@click.option(
|
403
|
+
"--reasoning-history",
|
404
|
+
type=click.Choice(["none", "all", "last", "auto"]),
|
405
|
+
help='Include reasoning in chat message history sent to generate (defaults to "auto", which uses the recommended default for each provider)',
|
402
406
|
envvar="INSPECT_EVAL_REASONING_HISTORY",
|
403
407
|
)
|
404
408
|
@click.option(
|
@@ -470,7 +474,8 @@ def eval_command(
|
|
470
474
|
max_tool_output: int | None,
|
471
475
|
cache_prompt: str | None,
|
472
476
|
reasoning_effort: str | None,
|
473
|
-
|
477
|
+
reasoning_tokens: int | None,
|
478
|
+
reasoning_history: Literal["none", "all", "last", "auto"] | None,
|
474
479
|
message_limit: int | None,
|
475
480
|
token_limit: int | None,
|
476
481
|
time_limit: int | None,
|
@@ -633,7 +638,8 @@ def eval_set_command(
|
|
633
638
|
max_tool_output: int | None,
|
634
639
|
cache_prompt: str | None,
|
635
640
|
reasoning_effort: str | None,
|
636
|
-
|
641
|
+
reasoning_tokens: int | None,
|
642
|
+
reasoning_history: Literal["none", "all", "last", "auto"] | None,
|
637
643
|
message_limit: int | None,
|
638
644
|
token_limit: int | None,
|
639
645
|
time_limit: int | None,
|
@@ -9,10 +9,12 @@ from .config import task_dict
|
|
9
9
|
|
10
10
|
|
11
11
|
@throttle(1)
|
12
|
-
def task_footer(
|
12
|
+
def task_footer(
|
13
|
+
counters: dict[str, str], style: str = ""
|
14
|
+
) -> tuple[RenderableType, RenderableType]:
|
13
15
|
return (
|
14
16
|
Text.from_markup(task_resources(), style=style),
|
15
|
-
Text.from_markup(
|
17
|
+
Text.from_markup(task_counters(counters), style=style),
|
16
18
|
)
|
17
19
|
|
18
20
|
|
@@ -23,5 +25,13 @@ def task_resources() -> str:
|
|
23
25
|
return task_dict(resources)
|
24
26
|
|
25
27
|
|
26
|
-
def
|
28
|
+
def task_counters(counters: dict[str, str]) -> str:
|
29
|
+
return task_dict(counters | task_http_rate_limits())
|
30
|
+
|
31
|
+
|
32
|
+
def task_http_rate_limits() -> dict[str, str]:
|
33
|
+
return {"HTTP rate limits": f"{http_rate_limit_count():,}"}
|
34
|
+
|
35
|
+
|
36
|
+
def task_http_rate_limits_str() -> str:
|
27
37
|
return f"HTTP rate limits: {http_rate_limit_count():,}"
|
@@ -22,7 +22,7 @@ from ..core.display import (
|
|
22
22
|
TaskSpec,
|
23
23
|
TaskWithResult,
|
24
24
|
)
|
25
|
-
from ..core.footer import
|
25
|
+
from ..core.footer import task_http_rate_limits_str
|
26
26
|
from ..core.panel import task_panel, task_targets
|
27
27
|
from ..core.results import task_metric, tasks_results
|
28
28
|
|
@@ -89,6 +89,10 @@ class PlainDisplay(Display):
|
|
89
89
|
show_model_names=self.multiple_model_names,
|
90
90
|
)
|
91
91
|
|
92
|
+
def display_counter(self, caption: str, value: str) -> None:
|
93
|
+
# Not supported for plain display as counters are only shown for tasks.
|
94
|
+
pass
|
95
|
+
|
92
96
|
def _print_results(self) -> None:
|
93
97
|
"""Print final results using rich panels"""
|
94
98
|
panels = tasks_results(self.tasks)
|
@@ -178,7 +182,7 @@ class PlainTaskDisplay(TaskDisplay):
|
|
178
182
|
status_parts.append(resources)
|
179
183
|
|
180
184
|
# Add rate limits
|
181
|
-
rate_limits =
|
185
|
+
rate_limits = task_http_rate_limits_str()
|
182
186
|
if rate_limits:
|
183
187
|
status_parts.append(rate_limits)
|
184
188
|
|
@@ -60,6 +60,7 @@ class RichDisplay(Display):
|
|
60
60
|
self.parallel = False
|
61
61
|
self.live: Live | None = None
|
62
62
|
self.timer_handle: asyncio.TimerHandle | None = None
|
63
|
+
self.counters: dict[str, str] = {}
|
63
64
|
rich_initialise()
|
64
65
|
|
65
66
|
@override
|
@@ -153,13 +154,20 @@ class RichDisplay(Display):
|
|
153
154
|
and self.live.is_started
|
154
155
|
):
|
155
156
|
if self.parallel:
|
156
|
-
r = tasks_live_status(
|
157
|
+
r = tasks_live_status(
|
158
|
+
self.total_tasks, self.tasks, self.progress_ui, self.counters
|
159
|
+
)
|
157
160
|
else:
|
158
|
-
r = task_live_status(self.tasks, self.progress_ui)
|
161
|
+
r = task_live_status(self.tasks, self.progress_ui, self.counters)
|
159
162
|
self.live.update(r, refresh=True)
|
160
163
|
|
161
164
|
self.timer_handle = asyncio.get_event_loop().call_later(1, self._update_display)
|
162
165
|
|
166
|
+
@override
|
167
|
+
def display_counter(self, caption: str, value: str) -> None:
|
168
|
+
self.counters[caption] = value
|
169
|
+
self._update_display()
|
170
|
+
|
163
171
|
|
164
172
|
class RichTaskScreen(TaskScreen):
|
165
173
|
def __init__(self, live: Live) -> None:
|
@@ -286,7 +294,9 @@ class RichTaskDisplay(TaskDisplay):
|
|
286
294
|
self.p.complete()
|
287
295
|
|
288
296
|
|
289
|
-
def task_live_status(
|
297
|
+
def task_live_status(
|
298
|
+
tasks: list[TaskStatus], progress: RProgress, counters: dict[str, str]
|
299
|
+
) -> RenderableType:
|
290
300
|
theme = rich_theme()
|
291
301
|
|
292
302
|
# the panel contents
|
@@ -300,13 +310,16 @@ def task_live_status(tasks: list[TaskStatus], progress: RProgress) -> Renderable
|
|
300
310
|
show_model=len(tasks) == 1,
|
301
311
|
body=Group("", progress),
|
302
312
|
subtitle=subtitle,
|
303
|
-
footer=task_footer(theme.light),
|
313
|
+
footer=task_footer(counters, theme.light),
|
304
314
|
log_location=None,
|
305
315
|
)
|
306
316
|
|
307
317
|
|
308
318
|
def tasks_live_status(
|
309
|
-
total_tasks: int,
|
319
|
+
total_tasks: int,
|
320
|
+
tasks: list[TaskStatus],
|
321
|
+
progress: RProgress,
|
322
|
+
counters: dict[str, str],
|
310
323
|
) -> RenderableType:
|
311
324
|
# rendering context
|
312
325
|
theme = rich_theme()
|
@@ -325,7 +338,7 @@ def tasks_live_status(
|
|
325
338
|
footer_table = Table.grid(expand=True)
|
326
339
|
footer_table.add_column()
|
327
340
|
footer_table.add_column(justify="right")
|
328
|
-
footer = task_footer(theme.light)
|
341
|
+
footer = task_footer(counters, theme.light)
|
329
342
|
footer_table.add_row()
|
330
343
|
footer_table.add_row(footer[0], footer[1])
|
331
344
|
|
@@ -89,6 +89,7 @@ class TaskScreenApp(App[TR]):
|
|
89
89
|
self._total_tasks = 0
|
90
90
|
self._parallel = False
|
91
91
|
self._tasks: list[TaskWithResult] = []
|
92
|
+
self._counters: dict[str, str] = {}
|
92
93
|
|
93
94
|
# all tasks processed by app
|
94
95
|
self._app_tasks: list[TaskWithResult] = []
|
@@ -302,7 +303,7 @@ class TaskScreenApp(App[TR]):
|
|
302
303
|
samples_view.set_samples(active_and_started_samples)
|
303
304
|
|
304
305
|
def update_footer(self) -> None:
|
305
|
-
left, right = task_footer()
|
306
|
+
left, right = task_footer(self._counters)
|
306
307
|
footer = self.query_one(AppFooter)
|
307
308
|
footer.left = left
|
308
309
|
footer.right = right
|
@@ -377,6 +378,10 @@ class TaskScreenApp(App[TR]):
|
|
377
378
|
except NoMatches:
|
378
379
|
return None
|
379
380
|
|
381
|
+
def display_counter(self, caption: str, value: str) -> None:
|
382
|
+
self._counters[caption] = value
|
383
|
+
self.update_footer()
|
384
|
+
|
380
385
|
class InputPanelHost(InputPanel.Host):
|
381
386
|
def __init__(self, app: "TaskScreenApp[TR]", tab_id: str) -> None:
|
382
387
|
self.app = app
|
@@ -72,3 +72,7 @@ class TextualDisplay(Display):
|
|
72
72
|
def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]:
|
73
73
|
with self.app.task_display(profile) as task_display:
|
74
74
|
yield task_display
|
75
|
+
|
76
|
+
@override
|
77
|
+
def display_counter(self, caption: str, value: str) -> None:
|
78
|
+
self.app.display_counter(caption, value)
|
@@ -9,7 +9,7 @@ from textual.containers import ScrollableContainer
|
|
9
9
|
from textual.widget import Widget
|
10
10
|
from textual.widgets import Static
|
11
11
|
|
12
|
-
from inspect_ai._util.content import ContentText
|
12
|
+
from inspect_ai._util.content import ContentReasoning, ContentText
|
13
13
|
from inspect_ai._util.rich import lines_display
|
14
14
|
from inspect_ai._util.transcript import (
|
15
15
|
set_transcript_markdown_options,
|
@@ -36,7 +36,6 @@ from inspect_ai.log._transcript import (
|
|
36
36
|
)
|
37
37
|
from inspect_ai.model._chat_message import (
|
38
38
|
ChatMessage,
|
39
|
-
ChatMessageAssistant,
|
40
39
|
ChatMessageUser,
|
41
40
|
)
|
42
41
|
from inspect_ai.model._render import messages_preceding_assistant
|
@@ -333,11 +332,16 @@ def render_message(message: ChatMessage) -> list[RenderableType]:
|
|
333
332
|
Text(),
|
334
333
|
]
|
335
334
|
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
if message.text:
|
335
|
+
# deal with plain text or with content blocks
|
336
|
+
if isinstance(message.content, str):
|
340
337
|
content.extend([transcript_markdown(message.text.strip(), escape=True)])
|
338
|
+
else:
|
339
|
+
for c in message.content:
|
340
|
+
if isinstance(c, ContentReasoning):
|
341
|
+
content.extend(transcript_reasoning(c))
|
342
|
+
elif isinstance(c, ContentText):
|
343
|
+
content.extend([transcript_markdown(c.text.strip(), escape=True)])
|
344
|
+
|
341
345
|
return content
|
342
346
|
|
343
347
|
|
inspect_ai/_eval/task/run.py
CHANGED
@@ -50,11 +50,7 @@ from inspect_ai.log import (
|
|
50
50
|
from inspect_ai.log._condense import condense_sample
|
51
51
|
from inspect_ai.log._file import eval_log_json_str
|
52
52
|
from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
|
53
|
-
from inspect_ai.log._samples import
|
54
|
-
active_sample,
|
55
|
-
set_active_sample_message_limit,
|
56
|
-
set_active_sample_token_limit,
|
57
|
-
)
|
53
|
+
from inspect_ai.log._samples import active_sample
|
58
54
|
from inspect_ai.log._transcript import (
|
59
55
|
ErrorEvent,
|
60
56
|
SampleInitEvent,
|
@@ -695,9 +691,10 @@ async def task_run_sample(
|
|
695
691
|
assert time_limit
|
696
692
|
timeout_cm = timeout(time_limit / 2)
|
697
693
|
|
698
|
-
# turn off
|
699
|
-
|
700
|
-
|
694
|
+
# turn off message and token limits
|
695
|
+
state.message_limit = None
|
696
|
+
state.token_limit = None
|
697
|
+
set_sample_state(state)
|
701
698
|
|
702
699
|
# scoring
|
703
700
|
try:
|
inspect_ai/_util/content.py
CHANGED
@@ -13,6 +13,25 @@ class ContentText(BaseModel):
|
|
13
13
|
"""Text content."""
|
14
14
|
|
15
15
|
|
16
|
+
class ContentReasoning(BaseModel):
|
17
|
+
"""Reasoning content.
|
18
|
+
|
19
|
+
See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
|
20
|
+
"""
|
21
|
+
|
22
|
+
type: Literal["reasoning"] = Field(default="reasoning")
|
23
|
+
"""Type."""
|
24
|
+
|
25
|
+
reasoning: str
|
26
|
+
"""Reasoning content."""
|
27
|
+
|
28
|
+
signature: str | None = Field(default=None)
|
29
|
+
"""Signature for reasoning content (used by some models to ensure that reasoning content is not modified for replay)"""
|
30
|
+
|
31
|
+
redacted: bool = Field(default=False)
|
32
|
+
"""Indicates that the explicit content of this reasoning block has been redacted."""
|
33
|
+
|
34
|
+
|
16
35
|
class ContentImage(BaseModel):
|
17
36
|
"""Image content."""
|
18
37
|
|
@@ -55,5 +74,5 @@ class ContentVideo(BaseModel):
|
|
55
74
|
"""Format of video data ('mp4', 'mpeg', or 'mov')"""
|
56
75
|
|
57
76
|
|
58
|
-
Content = Union[ContentText, ContentImage, ContentAudio, ContentVideo]
|
77
|
+
Content = Union[ContentText, ContentReasoning, ContentImage, ContentAudio, ContentVideo]
|
59
78
|
"""Content sent to or received from a model."""
|
inspect_ai/_util/transcript.py
CHANGED
@@ -10,6 +10,8 @@ from rich.panel import Panel
|
|
10
10
|
from rich.rule import Rule
|
11
11
|
from rich.text import Text
|
12
12
|
|
13
|
+
from inspect_ai._util.content import ContentReasoning
|
14
|
+
|
13
15
|
from .format import format_function_call
|
14
16
|
|
15
17
|
|
@@ -111,12 +113,16 @@ def transcript_panel(
|
|
111
113
|
)
|
112
114
|
|
113
115
|
|
114
|
-
def transcript_reasoning(reasoning:
|
116
|
+
def transcript_reasoning(reasoning: ContentReasoning) -> list[RenderableType]:
|
115
117
|
content: list[RenderableType] = []
|
118
|
+
text = (
|
119
|
+
reasoning.reasoning
|
120
|
+
if not reasoning.redacted
|
121
|
+
else "Reasoning encrypted by model provider."
|
122
|
+
)
|
123
|
+
|
116
124
|
content.append(
|
117
|
-
transcript_markdown(
|
118
|
-
f"**<think>** \n{reasoning} \n**</think>**\n\n", escape=True
|
119
|
-
)
|
125
|
+
transcript_markdown(f"**<think>** \n{text} \n**</think>**\n\n", escape=True)
|
120
126
|
)
|
121
127
|
content.append(Text())
|
122
128
|
return content
|
inspect_ai/_util/working.py
CHANGED
@@ -12,6 +12,10 @@ def sample_waiting_time() -> float:
|
|
12
12
|
return _sample_waiting_time.get()
|
13
13
|
|
14
14
|
|
15
|
+
def sample_working_time() -> float:
|
16
|
+
return time.monotonic() - _sample_start_time.get() - sample_waiting_time()
|
17
|
+
|
18
|
+
|
15
19
|
def report_sample_waiting_time(waiting_time: float) -> None:
|
16
20
|
_sample_waiting_time.set(_sample_waiting_time.get() + waiting_time)
|
17
21
|
check_sample_working_limit()
|
inspect_ai/_view/www/App.css
CHANGED
@@ -805,15 +805,21 @@ table.table.table-sm td {
|
|
805
805
|
overflow: unset;
|
806
806
|
}
|
807
807
|
|
808
|
+
.markdown-content pre[class*="language-"],
|
808
809
|
pre[class*="language-"].tool-output,
|
809
810
|
.tool-output {
|
810
811
|
background-color: #f8f8f8;
|
811
812
|
}
|
813
|
+
|
814
|
+
.vscode-dark .model-call pre[class*="language-"],
|
815
|
+
.vscode-dark .markdown-content pre[class*="language-"],
|
812
816
|
.vscode-dark pre[class*="language-"].tool-output,
|
813
817
|
.vscode-dark .tool-output {
|
814
818
|
background-color: #333333;
|
815
819
|
}
|
816
820
|
|
821
|
+
.model-call pre[class*="language-"],
|
822
|
+
.markdown-content pre[class*="language-"],
|
817
823
|
pre[class*="language-"].tool-output {
|
818
824
|
border: none !important;
|
819
825
|
box-shadow: none !important;
|