inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +27 -9
- inspect_ai/_display/core/display.py +2 -0
- inspect_ai/_display/core/footer.py +13 -3
- inspect_ai/_display/plain/display.py +6 -2
- inspect_ai/_display/rich/display.py +19 -6
- inspect_ai/_display/textual/app.py +9 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +4 -10
- inspect_ai/_display/textual/widgets/transcript.py +35 -18
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +6 -1
- inspect_ai/_eval/run.py +6 -0
- inspect_ai/_eval/task/run.py +49 -23
- inspect_ai/_eval/task/task.py +26 -3
- inspect_ai/_util/content.py +20 -1
- inspect_ai/_util/interrupt.py +6 -0
- inspect_ai/_util/logger.py +19 -0
- inspect_ai/_util/rich.py +7 -8
- inspect_ai/_util/text.py +13 -0
- inspect_ai/_util/transcript.py +20 -6
- inspect_ai/_util/working.py +50 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +171 -99
- inspect_ai/_view/www/dist/assets/index.js +5972 -2770
- inspect_ai/_view/www/eslint.config.mjs +24 -1
- inspect_ai/_view/www/log-schema.json +619 -21
- inspect_ai/_view/www/package.json +8 -3
- inspect_ai/_view/www/src/App.tsx +2 -2
- inspect_ai/_view/www/src/appearance/icons.ts +3 -1
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
- inspect_ai/_view/www/src/components/Card.tsx +9 -8
- inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
- inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
- inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
- inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
- inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
- inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
- inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
- inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
- inspect_ai/_view/www/src/index.tsx +2 -2
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
- inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
- inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
- inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
- inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
- inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
- inspect_ai/_view/www/src/types/log.d.ts +312 -137
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
- inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
- inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
- inspect_ai/_view/www/src/utils/format.ts +8 -5
- inspect_ai/_view/www/src/utils/json.ts +24 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
- inspect_ai/_view/www/yarn.lock +241 -5
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_condense.py +4 -0
- inspect_ai/log/_log.py +72 -12
- inspect_ai/log/_recorders/eval.py +6 -1
- inspect_ai/log/_samples.py +5 -1
- inspect_ai/log/_transcript.py +89 -2
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +8 -1
- inspect_ai/model/_chat_message.py +22 -7
- inspect_ai/model/_conversation.py +11 -9
- inspect_ai/model/_generate_config.py +25 -4
- inspect_ai/model/_model.py +164 -72
- inspect_ai/model/_model_call.py +10 -3
- inspect_ai/model/_model_output.py +3 -0
- inspect_ai/model/_openai.py +106 -40
- inspect_ai/model/_providers/anthropic.py +145 -26
- inspect_ai/model/_providers/bedrock.py +7 -0
- inspect_ai/model/_providers/cloudflare.py +20 -7
- inspect_ai/model/_providers/google.py +29 -8
- inspect_ai/model/_providers/groq.py +66 -27
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +78 -51
- inspect_ai/model/_providers/openai.py +66 -4
- inspect_ai/model/_providers/openai_o1.py +10 -0
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/util/tracker.py +92 -0
- inspect_ai/model/_providers/vllm.py +13 -5
- inspect_ai/model/_reasoning.py +15 -2
- inspect_ai/scorer/_model.py +23 -19
- inspect_ai/solver/_basic_agent.py +1 -3
- inspect_ai/solver/_bridge/patch.py +0 -2
- inspect_ai/solver/_human_agent/agent.py +14 -10
- inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
- inspect_ai/solver/_human_agent/commands/submit.py +76 -30
- inspect_ai/solver/_limit.py +4 -4
- inspect_ai/solver/_plan.py +0 -3
- inspect_ai/solver/_task_state.py +7 -0
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +3 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
- inspect_ai/tool/_tools/_web_search.py +3 -3
- inspect_ai/util/__init__.py +2 -1
- inspect_ai/util/_concurrency.py +14 -8
- inspect_ai/util/_display.py +12 -0
- inspect_ai/util/_sandbox/context.py +15 -0
- inspect_ai/util/_sandbox/docker/docker.py +7 -5
- inspect_ai/util/_sandbox/environment.py +32 -1
- inspect_ai/util/_sandbox/events.py +183 -0
- inspect_ai/util/_sandbox/local.py +3 -3
- inspect_ai/util/_sandbox/self_check.py +131 -43
- inspect_ai/util/_subtask.py +11 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
- inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
- inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
- inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -218,9 +218,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
 @click.option(
     "--time-limit",
     type=int,
-    help="Limit on total
+    help="Limit on total running time for each sample.",
     envvar="INSPECT_EVAL_TIME_LIMIT",
 )
+@click.option(
+    "--working-limit",
+    type=int,
+    help="Limit on total working time (e.g. model generation, tool calls, etc.) for each sample.",
+    envvar="INSPECT_EVAL_WORKING_LIMIT",
+)
 @click.option(
     "--fail-on-error",
     type=float,
@@ -384,15 +390,19 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
 @click.option(
     "--reasoning-effort",
     type=click.Choice(["low", "medium", "high"]),
-    help="Constrains effort on reasoning for reasoning models. Open AI
+    help="Constrains effort on reasoning for reasoning models. Open AI o-series models only.",
     envvar="INSPECT_EVAL_REASONING_EFFORT",
 )
 @click.option(
-    "--reasoning-
-    type=
-
-
-
+    "--reasoning-tokens",
+    type=int,
+    help="Maximum number of tokens to use for reasoning. Anthropic Claude models only.",
+    envvar="INSPECT_EVAL_REASONING_TOKENS",
+)
+@click.option(
+    "--reasoning-history",
+    type=click.Choice(["none", "all", "last", "auto"]),
+    help='Include reasoning in chat message history sent to generate (defaults to "auto", which uses the recommended default for each provider)',
     envvar="INSPECT_EVAL_REASONING_HISTORY",
 )
 @click.option(
@@ -464,10 +474,12 @@ def eval_command(
     max_tool_output: int | None,
     cache_prompt: str | None,
     reasoning_effort: str | None,
-
+    reasoning_tokens: int | None,
+    reasoning_history: Literal["none", "all", "last", "auto"] | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
@@ -518,6 +530,7 @@ def eval_command(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -625,10 +638,12 @@ def eval_set_command(
     max_tool_output: int | None,
     cache_prompt: str | None,
     reasoning_effort: str | None,
-
+    reasoning_tokens: int | None,
+    reasoning_history: Literal["none", "all", "last", "auto"] | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
@@ -684,6 +699,7 @@ def eval_set_command(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -737,6 +753,7 @@ def eval_exec(
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
@@ -817,6 +834,7 @@ def eval_exec(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
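For orientation, a hedged sketch of driving the two new reasoning options from Python rather than through the CLI flags above, assuming they surface as GenerateConfig fields of the same names (consistent with the _generate_config.py changes listed in this diff); the model id is a placeholder:

    from inspect_ai.model import GenerateConfig, get_model

    # Assumed mapping: --reasoning-tokens -> reasoning_tokens,
    # --reasoning-history -> reasoning_history; the model id is a placeholder.
    model = get_model(
        "anthropic/claude-3-7-sonnet-latest",
        config=GenerateConfig(
            reasoning_tokens=4096,      # Anthropic Claude models only
            reasoning_history="auto",   # "none" | "all" | "last" | "auto"
        ),
    )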
inspect_ai/_display/core/footer.py
CHANGED
@@ -9,10 +9,12 @@ from .config import task_dict


 @throttle(1)
-def task_footer(
+def task_footer(
+    counters: dict[str, str], style: str = ""
+) -> tuple[RenderableType, RenderableType]:
     return (
         Text.from_markup(task_resources(), style=style),
-        Text.from_markup(
+        Text.from_markup(task_counters(counters), style=style),
     )


@@ -23,5 +25,13 @@ def task_resources() -> str:
     return task_dict(resources)


-def
+def task_counters(counters: dict[str, str]) -> str:
+    return task_dict(counters | task_http_rate_limits())
+
+
+def task_http_rate_limits() -> dict[str, str]:
+    return {"HTTP rate limits": f"{http_rate_limit_count():,}"}
+
+
+def task_http_rate_limits_str() -> str:
     return f"HTTP rate limits: {http_rate_limit_count():,}"
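Aside on the merge in task_counters() above: the "|" operator is the standard dict merge (PEP 584), so counters reported through the display are shown alongside the built-in HTTP rate limit entry, with the right-hand mapping winning on duplicate keys. A standalone illustration (the "HTTP retries" caption is a made-up example, not taken from the package):

    # Plain dicts, outside inspect_ai: later mapping wins on duplicate keys.
    counters = {"HTTP retries": "3"}          # hypothetical display counter
    rate_limits = {"HTTP rate limits": "12"}
    print(counters | rate_limits)             # {'HTTP retries': '3', 'HTTP rate limits': '12'}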
inspect_ai/_display/plain/display.py
CHANGED
@@ -22,7 +22,7 @@ from ..core.display import (
     TaskSpec,
     TaskWithResult,
 )
-from ..core.footer import
+from ..core.footer import task_http_rate_limits_str
 from ..core.panel import task_panel, task_targets
 from ..core.results import task_metric, tasks_results

@@ -89,6 +89,10 @@ class PlainDisplay(Display):
             show_model_names=self.multiple_model_names,
         )

+    def display_counter(self, caption: str, value: str) -> None:
+        # Not supported for plain display as counters are only shown for tasks.
+        pass
+
     def _print_results(self) -> None:
         """Print final results using rich panels"""
         panels = tasks_results(self.tasks)
@@ -178,7 +182,7 @@ class PlainTaskDisplay(TaskDisplay):
             status_parts.append(resources)

             # Add rate limits
-            rate_limits =
+            rate_limits = task_http_rate_limits_str()
             if rate_limits:
                 status_parts.append(rate_limits)

inspect_ai/_display/rich/display.py
CHANGED
@@ -60,6 +60,7 @@ class RichDisplay(Display):
         self.parallel = False
         self.live: Live | None = None
         self.timer_handle: asyncio.TimerHandle | None = None
+        self.counters: dict[str, str] = {}
         rich_initialise()

     @override
@@ -153,13 +154,20 @@ class RichDisplay(Display):
             and self.live.is_started
         ):
             if self.parallel:
-                r = tasks_live_status(
+                r = tasks_live_status(
+                    self.total_tasks, self.tasks, self.progress_ui, self.counters
+                )
             else:
-                r = task_live_status(self.tasks, self.progress_ui)
+                r = task_live_status(self.tasks, self.progress_ui, self.counters)
             self.live.update(r, refresh=True)

         self.timer_handle = asyncio.get_event_loop().call_later(1, self._update_display)

+    @override
+    def display_counter(self, caption: str, value: str) -> None:
+        self.counters[caption] = value
+        self._update_display()
+

 class RichTaskScreen(TaskScreen):
     def __init__(self, live: Live) -> None:
@@ -286,7 +294,9 @@ class RichTaskDisplay(TaskDisplay):
         self.p.complete()


-def task_live_status(
+def task_live_status(
+    tasks: list[TaskStatus], progress: RProgress, counters: dict[str, str]
+) -> RenderableType:
     theme = rich_theme()

     # the panel contents
@@ -300,13 +310,16 @@ def task_live_status(tasks: list[TaskStatus], progress: RProgress) -> Renderable
         show_model=len(tasks) == 1,
         body=Group("", progress),
         subtitle=subtitle,
-        footer=task_footer(theme.light),
+        footer=task_footer(counters, theme.light),
         log_location=None,
     )


 def tasks_live_status(
-    total_tasks: int,
+    total_tasks: int,
+    tasks: list[TaskStatus],
+    progress: RProgress,
+    counters: dict[str, str],
 ) -> RenderableType:
     # rendering context
     theme = rich_theme()
@@ -325,7 +338,7 @@ def tasks_live_status(
     footer_table = Table.grid(expand=True)
     footer_table.add_column()
     footer_table.add_column(justify="right")
-    footer = task_footer(theme.light)
+    footer = task_footer(counters, theme.light)
     footer_table.add_row()
     footer_table.add_row(footer[0], footer[1])

inspect_ai/_display/textual/app.py
CHANGED
@@ -89,6 +89,7 @@ class TaskScreenApp(App[TR]):
         self._total_tasks = 0
         self._parallel = False
         self._tasks: list[TaskWithResult] = []
+        self._counters: dict[str, str] = {}

         # all tasks processed by app
         self._app_tasks: list[TaskWithResult] = []
@@ -185,7 +186,8 @@ class TaskScreenApp(App[TR]):
         # force repaint
         self.refresh(repaint=True)

-        # enable mouse support (this broke in textual 2.0 when running in VS Code
+        # enable mouse support (this broke in textual 2.0 when running in VS Code
+        # however is fixed in textual 2.1)
         assert self.app._driver
         textual_enable_mouse_support(self.app._driver)

@@ -301,7 +303,7 @@ class TaskScreenApp(App[TR]):
         samples_view.set_samples(active_and_started_samples)

     def update_footer(self) -> None:
-        left, right = task_footer()
+        left, right = task_footer(self._counters)
         footer = self.query_one(AppFooter)
         footer.left = left
         footer.right = right
@@ -315,7 +317,7 @@ class TaskScreenApp(App[TR]):

         def set_unread(unread: int | None) -> None:
             if unread is not None:
-                console_tab.label = f"Console ({unread}"  # type: ignore[assignment]
+                console_tab.label = f"Console ({unread})"  # type: ignore[assignment]
             else:
                 console_tab.label = "Console"  # type: ignore[assignment]

@@ -376,6 +378,10 @@ class TaskScreenApp(App[TR]):
         except NoMatches:
             return None

+    def display_counter(self, caption: str, value: str) -> None:
+        self._counters[caption] = value
+        self.update_footer()
+
     class InputPanelHost(InputPanel.Host):
         def __init__(self, app: "TaskScreenApp[TR]", tab_id: str) -> None:
             self.app = app
inspect_ai/_display/textual/display.py
CHANGED
@@ -72,3 +72,7 @@ class TextualDisplay(Display):
     def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]:
         with self.app.task_display(profile) as task_display:
             yield task_display
+
+    @override
+    def display_counter(self, caption: str, value: str) -> None:
+        self.app.display_counter(caption, value)
inspect_ai/_display/textual/widgets/samples.py
CHANGED
@@ -39,7 +39,7 @@ class SamplesView(Widget):
         padding: 0 1 0 1;
         layout: grid;
         grid-size: 2 3;
-        grid-rows: auto 1fr
+        grid-rows: auto 1fr 3;
         grid-columns: 32 1fr;
         grid-gutter: 1;
     }
@@ -141,8 +141,8 @@ class SamplesList(OptionList):
         if highlighted_sample and (highlighted_sample not in self.samples):
             self.samples.append(highlighted_sample)

-        # sort the samples by
-        self.samples.sort(key=lambda sample: sample.
+        # sort the samples by running time
+        self.samples.sort(key=lambda sample: sample.running_time, reverse=True)

         # rebuild the list
         self.clear_options()
@@ -154,9 +154,7 @@ class SamplesList(OptionList):
         table.add_column(width=1)
         task_name = Text.from_markup(f"{registry_unqualified_name(sample.task)}")
         task_name.truncate(18, overflow="ellipsis", pad=True)
-        task_time = Text.from_markup(
-            f"{format_progress_time(sample.execution_time)}"
-        )
+        task_time = Text.from_markup(f"{format_progress_time(sample.running_time)}")
         table.add_row(task_name, task_time, " ")
         sample_id = Text.from_markup(f"id: {sample.sample.id}")
         sample_id.truncate(18, overflow="ellipsis", pad=True)
@@ -423,10 +421,6 @@ class SampleToolbar(Horizontal):
     CANCEL_DISABLED = "Cancelling sample..."

     DEFAULT_CSS = f"""
-    SampleToolbar {{
-        grid-size: 5 1;
-        grid-columns: auto auto 1fr auto auto;
-    }}
     SampleToolbar #{STATUS_GROUP} {{
         width: 22;
     }}
inspect_ai/_display/textual/widgets/transcript.py
CHANGED
@@ -9,7 +9,7 @@ from textual.containers import ScrollableContainer
 from textual.widget import Widget
 from textual.widgets import Static

-from inspect_ai._util.content import ContentText
+from inspect_ai._util.content import ContentReasoning, ContentText
 from inspect_ai._util.rich import lines_display
 from inspect_ai._util.transcript import (
     set_transcript_markdown_options,
@@ -36,7 +36,6 @@ from inspect_ai.log._transcript import (
 )
 from inspect_ai.model._chat_message import (
     ChatMessage,
-    ChatMessageAssistant,
     ChatMessageUser,
 )
 from inspect_ai.model._render import messages_preceding_assistant
@@ -193,16 +192,29 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
     return EventDisplay(f"model: {event.model}", Group(*content))


-def
-
-
-
-    for
-
+def render_sub_events(events: list[Event]) -> list[RenderableType]:
+    content: list[RenderableType] = []
+    for e in events:
+        event_displays = render_event(e) or []
+        for d in event_displays:
+            if d.content:
+                content.append(Text(" "))
+                content.append(transcript_separator(d.title, "black", "··"))
+                if isinstance(d.content, Markdown):
+                    set_transcript_markdown_options(d.content)
+                content.append(d.content)

+    return content
+
+
+def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
     # render the call
     content = transcript_tool_call(event)

+    # render sub-events
+    if event.events:
+        content.extend(render_sub_events(event.events))
+
     # render the output
     if isinstance(event.result, list):
         result: ToolResult = "\n".join(
@@ -220,7 +232,7 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
         result = str(result).strip()
         content.extend(lines_display(result, 50))

-    return
+    return [EventDisplay("tool call", Group(*content))]


 def render_step_event(event: StepEvent) -> EventDisplay:
@@ -257,13 +269,13 @@ def render_score_event(event: ScoreEvent) -> EventDisplay:


 def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
+    # render header
+    content: list[RenderableType] = [transcript_function(event.name, event.input)]
+
     # render sub-events
-    display: list[EventDisplay] = []
     if event.events:
-
-        display.extend(render_event(e) or [])
+        content.extend(render_sub_events(event.events))

-    content: list[RenderableType] = [transcript_function(event.name, event.input)]
     if event.result:
         content.append(Text())
         if isinstance(event.result, str | int | float | bool | None):
@@ -271,7 +283,7 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
         else:
             content.append(render_as_json(event.result))

-    return
+    return [EventDisplay(f"subtask: {event.name}", Group(*content))]


 def render_input_event(event: InputEvent) -> EventDisplay:
@@ -320,11 +332,16 @@ def render_message(message: ChatMessage) -> list[RenderableType]:
         Text(),
     ]

-
-
-
-    if message.text:
+    # deal with plain text or with content blocks
+    if isinstance(message.content, str):
         content.extend([transcript_markdown(message.text.strip(), escape=True)])
+    else:
+        for c in message.content:
+            if isinstance(c, ContentReasoning):
+                content.extend(transcript_reasoning(c))
+            elif isinstance(c, ContentText):
+                content.extend([transcript_markdown(c.text.strip(), escape=True)])
+
     return content


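The render_message() change above dispatches over content blocks instead of assuming plain text. A hedged sketch of the same dispatch outside the widget, assuming ContentReasoning carries its text in a `reasoning` field and ContentText in `text` (the sample strings are invented):

    from inspect_ai._util.content import ContentReasoning, ContentText

    # Same isinstance dispatch as render_message(); field names are assumptions.
    blocks = [
        ContentReasoning(reasoning="First check the units..."),
        ContentText(text="The answer is 42."),
    ]
    for c in blocks:
        if isinstance(c, ContentReasoning):
            print("[reasoning]", c.reasoning)
        elif isinstance(c, ContentText):
            print(c.text)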
inspect_ai/_eval/eval.py
CHANGED
@@ -75,6 +75,7 @@ def eval(
     message_limit: int | None = None,
     token_limit: int | None = None,
     time_limit: int | None = None,
+    working_limit: int | None = None,
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
@@ -132,7 +133,10 @@ def eval(
         so they can be debugged (defaults to False).
       message_limit: Limit on total messages used for each sample.
       token_limit: Limit on total tokens used for each sample.
-      time_limit: Limit on time (in seconds) for
+      time_limit: Limit on clock time (in seconds) for samples.
+      working_limit: Limit on working time (in seconds) for sample. Working
+        time includes model generation, tool calls, etc. but does not include
+        time spent waiting on retries or shared resources.
       max_samples: Maximum number of samples to run in parallel
         (default is max_connections)
       max_tasks: Maximum number of tasks to run in parallel
@@ -186,6 +190,7 @@ def eval(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -227,6 +232,7 @@ async def eval_async(
     message_limit: int | None = None,
     token_limit: int | None = None,
     time_limit: int | None = None,
+    working_limit: int | None = None,
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
@@ -281,7 +287,10 @@ async def eval_async(
         so they can be debugged (defaults to False).
       message_limit (int | None): Limit on total messages used for each sample.
       token_limit (int | None): Limit on total tokens used for each sample.
-      time_limit
+      time_limit: Limit on clock time (in seconds) for samples.
+      working_limit: Limit on working time (in seconds) for sample. Working
+        time includes model generation, tool calls, etc. but does not include
+        time spent waiting on retries or shared resources.
       max_samples (int | None): Maximum number of samples to run in parallel
         (default is max_connections)
       max_tasks (int | None): Maximum number of tasks to run in parallel
@@ -395,6 +404,7 @@ async def eval_async(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -702,6 +712,7 @@ async def eval_retry_async(
         message_limit = eval_log.eval.config.message_limit
         token_limit = eval_log.eval.config.token_limit
         time_limit = eval_log.eval.config.time_limit
+        working_limit = eval_log.eval.config.working_limit
         max_samples = max_samples or eval_log.eval.config.max_samples
         max_tasks = max_tasks or eval_log.eval.config.max_tasks
         max_subprocesses = max_subprocesses or eval_log.eval.config.max_subprocesses
@@ -763,6 +774,7 @@ async def eval_retry_async(
             message_limit=message_limit,
             token_limit=token_limit,
             time_limit=time_limit,
+            working_limit=working_limit,
             max_samples=max_samples,
             max_tasks=max_tasks,
             max_subprocesses=max_subprocesses,
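A hedged sketch of the new working_limit parameter alongside the existing per-sample limits, assuming a task registered via @task (the task name and model id are placeholders):

    from inspect_ai import eval

    # time_limit caps wall-clock time per sample; working_limit caps working
    # time (model generation, tool calls, etc.), excluding retry/backoff waits.
    logs = eval(
        "my_task",              # placeholder task name
        model="openai/gpt-4o",  # placeholder model
        time_limit=1200,
        working_limit=600,
    )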
inspect_ai/_eval/evalset.py
CHANGED
@@ -79,6 +79,7 @@ def eval_set(
     message_limit: int | None = None,
     token_limit: int | None = None,
     time_limit: int | None = None,
+    working_limit: int | None = None,
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
@@ -146,7 +147,10 @@ def eval_set(
         so they can be debugged (defaults to False).
       message_limit: Limit on total messages used for each sample.
       token_limit: Limit on total tokens used for each sample.
-      time_limit: Limit on time (in seconds) for
+      time_limit: Limit on clock time (in seconds) for samples.
+      working_limit: Limit on working time (in seconds) for sample. Working
+        time includes model generation, tool calls, etc. but does not include
+        time spent waiting on retries or shared resources.
       max_samples: Maximum number of samples to run in parallel
         (default is max_connections)
       max_tasks: Maximum number of tasks to run in parallel
@@ -202,6 +206,7 @@ def eval_set(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
inspect_ai/_eval/run.py
CHANGED
@@ -163,6 +163,12 @@ async def eval_run(
         else:
             task.time_limit = task_eval_config.time_limit

+        # sample execution limit
+        if task_eval_config.working_limit is None:
+            task_eval_config.working_limit = task.working_limit
+        else:
+            task.working_limit = task_eval_config.working_limit
+
         # fail_on_error
         if task_eval_config.fail_on_error is None:
             task_eval_config.fail_on_error = task.fail_on_error