inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +27 -9
- inspect_ai/_display/core/display.py +2 -0
- inspect_ai/_display/core/footer.py +13 -3
- inspect_ai/_display/plain/display.py +6 -2
- inspect_ai/_display/rich/display.py +19 -6
- inspect_ai/_display/textual/app.py +9 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +4 -10
- inspect_ai/_display/textual/widgets/transcript.py +35 -18
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +6 -1
- inspect_ai/_eval/run.py +6 -0
- inspect_ai/_eval/task/run.py +49 -23
- inspect_ai/_eval/task/task.py +26 -3
- inspect_ai/_util/content.py +20 -1
- inspect_ai/_util/interrupt.py +6 -0
- inspect_ai/_util/logger.py +19 -0
- inspect_ai/_util/rich.py +7 -8
- inspect_ai/_util/text.py +13 -0
- inspect_ai/_util/transcript.py +20 -6
- inspect_ai/_util/working.py +50 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +171 -99
- inspect_ai/_view/www/dist/assets/index.js +5972 -2770
- inspect_ai/_view/www/eslint.config.mjs +24 -1
- inspect_ai/_view/www/log-schema.json +619 -21
- inspect_ai/_view/www/package.json +8 -3
- inspect_ai/_view/www/src/App.tsx +2 -2
- inspect_ai/_view/www/src/appearance/icons.ts +3 -1
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
- inspect_ai/_view/www/src/components/Card.tsx +9 -8
- inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
- inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
- inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
- inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
- inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
- inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
- inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
- inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
- inspect_ai/_view/www/src/index.tsx +2 -2
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
- inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
- inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
- inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
- inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
- inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
- inspect_ai/_view/www/src/types/log.d.ts +312 -137
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
- inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
- inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
- inspect_ai/_view/www/src/utils/format.ts +8 -5
- inspect_ai/_view/www/src/utils/json.ts +24 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
- inspect_ai/_view/www/yarn.lock +241 -5
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_condense.py +4 -0
- inspect_ai/log/_log.py +72 -12
- inspect_ai/log/_recorders/eval.py +6 -1
- inspect_ai/log/_samples.py +5 -1
- inspect_ai/log/_transcript.py +89 -2
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +8 -1
- inspect_ai/model/_chat_message.py +22 -7
- inspect_ai/model/_conversation.py +11 -9
- inspect_ai/model/_generate_config.py +25 -4
- inspect_ai/model/_model.py +164 -72
- inspect_ai/model/_model_call.py +10 -3
- inspect_ai/model/_model_output.py +3 -0
- inspect_ai/model/_openai.py +106 -40
- inspect_ai/model/_providers/anthropic.py +145 -26
- inspect_ai/model/_providers/bedrock.py +7 -0
- inspect_ai/model/_providers/cloudflare.py +20 -7
- inspect_ai/model/_providers/google.py +29 -8
- inspect_ai/model/_providers/groq.py +66 -27
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +78 -51
- inspect_ai/model/_providers/openai.py +66 -4
- inspect_ai/model/_providers/openai_o1.py +10 -0
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/util/tracker.py +92 -0
- inspect_ai/model/_providers/vllm.py +13 -5
- inspect_ai/model/_reasoning.py +15 -2
- inspect_ai/scorer/_model.py +23 -19
- inspect_ai/solver/_basic_agent.py +1 -3
- inspect_ai/solver/_bridge/patch.py +0 -2
- inspect_ai/solver/_human_agent/agent.py +14 -10
- inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
- inspect_ai/solver/_human_agent/commands/submit.py +76 -30
- inspect_ai/solver/_limit.py +4 -4
- inspect_ai/solver/_plan.py +0 -3
- inspect_ai/solver/_task_state.py +7 -0
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +3 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
- inspect_ai/tool/_tools/_web_search.py +3 -3
- inspect_ai/util/__init__.py +2 -1
- inspect_ai/util/_concurrency.py +14 -8
- inspect_ai/util/_display.py +12 -0
- inspect_ai/util/_sandbox/context.py +15 -0
- inspect_ai/util/_sandbox/docker/docker.py +7 -5
- inspect_ai/util/_sandbox/environment.py +32 -1
- inspect_ai/util/_sandbox/events.py +183 -0
- inspect_ai/util/_sandbox/local.py +3 -3
- inspect_ai/util/_sandbox/self_check.py +131 -43
- inspect_ai/util/_subtask.py +11 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
- inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
- inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
- inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/task/run.py
CHANGED
@@ -33,6 +33,10 @@ from inspect_ai._util.registry import (
     registry_unqualified_name,
 )
 from inspect_ai._util.timeouts import Timeout, timeout
+from inspect_ai._util.working import (
+    init_sample_working_limit,
+    sample_waiting_time,
+)
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
 from inspect_ai.log import (
@@ -46,16 +50,13 @@ from inspect_ai.log import (
 from inspect_ai.log._condense import condense_sample
 from inspect_ai.log._file import eval_log_json_str
 from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
-from inspect_ai.log._samples import (
-    active_sample,
-    set_active_sample_message_limit,
-    set_active_sample_token_limit,
-)
+from inspect_ai.log._samples import active_sample
 from inspect_ai.log._transcript import (
     ErrorEvent,
     SampleInitEvent,
     SampleLimitEvent,
     ScoreEvent,
+    StepEvent,
     transcript,
 )
 from inspect_ai.model import (
@@ -182,9 +183,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
     if isinstance(solver, Plan):
         plan = solver
     elif isinstance(solver, Chain):
-        plan = Plan(list(solver), internal=True)
+        plan = Plan(list(solver), cleanup=task.cleanup, internal=True)
     else:
-        plan = Plan(unroll(solver), internal=True)
+        plan = Plan(unroll(solver), cleanup=task.cleanup, internal=True)

     # add setup solver(s) if specified
     if task.setup:
@@ -308,6 +309,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
                     or config.fail_on_error is True
                 ),
                 time_limit=config.time_limit,
+                working_limit=config.working_limit,
                 semaphore=sample_semaphore,
             )
             for (sample, state) in zip(samples, states)
@@ -500,6 +502,7 @@ async def task_run_sample(
     sample_complete: Callable[[dict[str, SampleScore]], None],
     fails_on_error: bool,
     time_limit: int | None,
+    working_limit: int | None,
     semaphore: asyncio.Semaphore | None,
 ) -> dict[str, SampleScore] | None:
     # if there is an existing sample then tick off its progress, log it, and return it
@@ -570,19 +573,37 @@ async def task_run_sample(
             message_limit=state.message_limit,
             token_limit=state.token_limit,
             time_limit=time_limit,
+            working_limit=working_limit,
             fails_on_error=fails_on_error,
             transcript=sample_transcript,
         ) as active,
     ):
+        start_time: float | None = None
         error: EvalError | None = None
         raise_error: BaseException | None = None
         results: dict[str, SampleScore] = {}
         try:
+            # begin init
+            transcript()._event(StepEvent(action="begin", name="init"))
+
+            # sample init event (remove file bodies as they have content or absolute paths)
+            event_sample = sample.model_copy(
+                update=dict(files={k: "" for k in sample.files.keys()})
+                if sample.files
+                else None
+            )
+            transcript()._event(
+                SampleInitEvent(sample=event_sample, state=state_jsonable(state))
+            )
+
             async with sandboxenv_cm:
                 try:
                     # update active sample wth sandboxes now that we are initialised
                     active.sandboxes = await sandbox_connections()

+                    # end init
+                    transcript()._event(StepEvent(action="end", name="init"))
+
                    # initialise timeout context manager
                    timeout_cm = (
                        timeout(time_limit)
@@ -590,23 +611,15 @@ async def task_run_sample(
                        else contextlib.nullcontext()
                    )

+                    # record start time
+                    start_time = time.monotonic()
+                    init_sample_working_limit(start_time, working_limit)
+
                    # run sample w/ optional timeout
                    async with timeout_cm:
                        # mark started
                        active.started = datetime.now().timestamp()

-                        # sample init event (remove file bodies as they have content or absolute paths)
-                        event_sample = sample.model_copy(
-                            update=dict(files={k: "" for k in sample.files.keys()})
-                            if sample.files
-                            else None
-                        )
-                        transcript()._event(
-                            SampleInitEvent(
-                                sample=event_sample, state=state_jsonable(state)
-                            )
-                        )
-
                        # set progress for plan then run it
                        state = await plan(state, generate)

@@ -661,11 +674,13 @@ async def task_run_sample(

            # capture most recent state for scoring
            state = ex.state or sample_state() or state
-            state.completed = True

        except BaseException as ex:
            error, raise_error = handle_error(ex)

+        # mark completed
+        state.completed = True
+
        # set timeout for scoring. if the original timeout was hit we still
        # want to provide opportunity for scoring, but we don't necessarily
        # want to wait the full timeout again (especially in the case where
@@ -676,9 +691,10 @@ async def task_run_sample(
            assert time_limit
            timeout_cm = timeout(time_limit / 2)

-        # turn off
-
-
+        # turn off message and token limits
+        state.message_limit = None
+        state.token_limit = None
+        set_sample_state(state)

        # scoring
        try:
@@ -768,6 +784,7 @@ async def task_run_sample(

        # log the sample
        await log_sample(
+            start_time=start_time,
            logger=logger,
            sample=sample,
            state=state,
@@ -788,6 +805,7 @@ async def task_run_sample(


 async def log_sample(
+    start_time: float | None,
     logger: TaskLogger,
     sample: Sample,
     state: TaskState,
@@ -804,6 +822,9 @@ async def log_sample(

     # construct sample for logging

+    # compute total time if we can
+    total_time = time.monotonic() - start_time if start_time is not None else None
+
     # if a limit was hit, note that in the Eval Sample
     limit = None
     for e in transcript().events:
@@ -827,8 +848,13 @@ async def log_sample(
         output=state.output,
         scores={k: v.score for k, v in scores.items()},
         store=dict(state.store.items()),
+        uuid=state.uuid,
         events=list(transcript().events),
         model_usage=sample_model_usage(),
+        total_time=round(total_time, 3) if total_time is not None else None,
+        working_time=round(total_time - sample_waiting_time(), 3)
+        if total_time is not None
+        else None,
         error=error,
         limit=limit,
     )
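The bookkeeping above is straightforward: the runner records a monotonic start time when the sample begins executing, and at logging time it derives total time from that clock and working time by subtracting the waiting time accumulated in `inspect_ai._util.working`. A minimal sketch of that arithmetic (illustrative only; the 300-second limit is a placeholder):

```python
import time

from inspect_ai._util.working import init_sample_working_limit, sample_waiting_time

# at sample start (as in task_run_sample above)
start_time = time.monotonic()
init_sample_working_limit(start_time, 300)  # working_limit in seconds (or None)

# ... solvers, tool calls, and model generation run here ...

# at logging time (as in log_sample above)
total_time = time.monotonic() - start_time
working_time = total_time - sample_waiting_time()  # excludes retry/semaphore waits
```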
inspect_ai/_eval/task/task.py
CHANGED
@@ -1,7 +1,7 @@
 from copy import deepcopy
 from dataclasses import dataclass
 from logging import getLogger
-from typing import Any, Callable, Sequence, cast
+from typing import Any, Awaitable, Callable, Sequence, cast

 from pydantic import BaseModel
 from typing_extensions import TypedDict, Unpack
@@ -17,6 +17,7 @@ from inspect_ai.scorer import Metric, Scorer
 from inspect_ai.scorer._reducer import ScoreReducers, create_reducers
 from inspect_ai.solver import Plan, Solver, generate
 from inspect_ai.solver._chain import chain
+from inspect_ai.solver._task_state import TaskState
 from inspect_ai.util._sandbox.environment import (
     SandboxEnvironmentSpec,
     SandboxEnvironmentType,
@@ -46,6 +47,7 @@ class Task:
         dataset: Dataset | Sequence[Sample] | None = None,
         setup: Solver | list[Solver] | None = None,
         solver: Solver | list[Solver] = generate(),
+        cleanup: Callable[[TaskState], Awaitable[None]] | None = None,
         scorer: Scorer | list[Scorer] | None = None,
         metrics: list[Metric] | dict[str, list[Metric]] | None = None,
         config: GenerateConfig = GenerateConfig(),
@@ -56,6 +58,7 @@ class Task:
         message_limit: int | None = None,
         token_limit: int | None = None,
         time_limit: int | None = None,
+        working_limit: int | None = None,
         name: str | None = None,
         version: int = 0,
         metadata: dict[str, Any] | None = None,
@@ -69,6 +72,9 @@ class Task:
                 even when the main `solver` is replaced).
             solver: (Solver | list[Solver]): Solver or list of solvers.
                 Defaults to generate(), a normal call to the model.
+            cleanup: Optional cleanup function for task. Called after
+                all solvers have run for each sample (including if an
+                exception occurs during the run)
             scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
             metrics (list[Metric] | dict[str, list[Metric]] | None):
                 Alternative metrics (overrides the metrics provided by the specified scorer).
@@ -86,7 +92,10 @@ class Task:
                 eval if a count of samples fails.
             message_limit (int | None): Limit on total messages used for each sample.
             token_limit (int | None): Limit on total tokens used for each sample.
-            time_limit
+            time_limit: Limit on clock time (in seconds) for samples.
+            working_limit: Limit on working time (in seconds) for sample. Working
+                time includes model generation, tool calls, etc. but does not include
+                time spent waiting on retries or shared resources.
             name: (str | None): Task name. If not specified is automatically
                 determined based on the name of the task directory (or "task")
                 if its anonymous task (e.g. created in a notebook and passed to
@@ -123,6 +132,7 @@ class Task:
         self.dataset = resolve_dataset(dataset)
         self.setup = setup
         self.solver = resolve_solver(solver)
+        self.cleanup = cleanup
         self.scorer = resolve_scorer(scorer)
         self.metrics = metrics
         self.config = config
@@ -135,6 +145,7 @@ class Task:
         self.message_limit = message_limit
         self.token_limit = token_limit
         self.time_limit = time_limit
+        self.working_limit = working_limit
         self.version = version
         self._name = name
         self.metadata = metadata
@@ -162,6 +173,7 @@ def task_with(
     dataset: Dataset | Sequence[Sample] | None | NotGiven = NOT_GIVEN,
     setup: Solver | list[Solver] | None | NotGiven = NOT_GIVEN,
     solver: Solver | list[Solver] | NotGiven = NOT_GIVEN,
+    cleanup: Callable[[TaskState], Awaitable[None]] | None | NotGiven = NOT_GIVEN,
     scorer: Scorer | list[Scorer] | None | NotGiven = NOT_GIVEN,
     metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
     config: GenerateConfig | NotGiven = NOT_GIVEN,
@@ -172,6 +184,7 @@ def task_with(
     message_limit: int | None | NotGiven = NOT_GIVEN,
     token_limit: int | None | NotGiven = NOT_GIVEN,
     time_limit: int | None | NotGiven = NOT_GIVEN,
+    working_limit: int | None | NotGiven = NOT_GIVEN,
     name: str | None | NotGiven = NOT_GIVEN,
     version: int | NotGiven = NOT_GIVEN,
     metadata: dict[str, Any] | None | NotGiven = NOT_GIVEN,
@@ -185,6 +198,9 @@ def task_with(
             even when the main `solver` is replaced).
         solver: (Solver | list[Solver]): Solver or list of solvers.
             Defaults to generate(), a normal call to the model.
+        cleanup: Optional cleanup function for task. Called after
+            all solvers have run for each sample (including if an
+            exception occurs during the run)
         scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
         metrics (list[Metric] | dict[str, list[Metric]] | None):
             Alternative metrics (overrides the metrics provided by the specified scorer).
@@ -202,7 +218,10 @@ def task_with(
             eval if a count of samples fails.
         message_limit (int | None): Limit on total messages used for each sample.
         token_limit (int | None): Limit on total tokens used for each sample.
-        time_limit
+        time_limit: Limit on clock time (in seconds) for samples.
+        working_limit: Limit on execution time (in seconds) for sample. Execution
+            time includes model generation, tool calls, etc. but does not include
+            time spent waiting on retries or shared resources.
         name: (str | None): Task name. If not specified is automatically
             determined based on the name of the task directory (or "task")
             if its anonymous task (e.g. created in a notebook and passed to
@@ -223,6 +242,8 @@ def task_with(
         task.setup = setup
     if not isinstance(solver, NotGiven):
         task.solver = resolve_solver(solver)
+    if not isinstance(cleanup, NotGiven):
+        task.cleanup = cleanup
     if not isinstance(scorer, NotGiven):
         task.scorer = resolve_scorer(scorer)
     if not isinstance(metrics, NotGiven):
@@ -245,6 +266,8 @@ def task_with(
         task.token_limit = token_limit
     if not isinstance(time_limit, NotGiven):
         task.time_limit = time_limit
+    if not isinstance(working_limit, NotGiven):
+        task.working_limit = working_limit
     if not isinstance(version, NotGiven):
         task.version = version
     if not isinstance(name, NotGiven):
inspect_ai/_util/content.py
CHANGED
@@ -13,6 +13,25 @@ class ContentText(BaseModel):
     """Text content."""


+class ContentReasoning(BaseModel):
+    """Reasoning content.
+
+    See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
+    """
+
+    type: Literal["reasoning"] = Field(default="reasoning")
+    """Type."""
+
+    reasoning: str
+    """Reasoning content."""
+
+    signature: str | None = Field(default=None)
+    """Signature for reasoning content (used by some models to ensure that reasoning content is not modified for replay)"""
+
+    redacted: bool = Field(default=False)
+    """Indicates that the explicit content of this reasoning block has been redacted."""
+
+
 class ContentImage(BaseModel):
     """Image content."""

@@ -55,5 +74,5 @@ class ContentVideo(BaseModel):
     """Format of video data ('mp4', 'mpeg', or 'mov')"""


-Content = Union[ContentText, ContentImage, ContentAudio, ContentVideo]
+Content = Union[ContentText, ContentReasoning, ContentImage, ContentAudio, ContentVideo]
 """Content sent to or received from a model."""
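A short sketch of how the new content type composes with existing content (the reasoning text and message layout here are illustrative):

```python
from inspect_ai._util.content import ContentReasoning, ContentText

reasoning = ContentReasoning(
    reasoning="First add the units digits, then carry the one.",
    signature=None,   # some providers attach a signature so the block can be replayed unmodified
    redacted=False,
)

# Content is now a union that includes ContentReasoning, so reasoning blocks
# can travel alongside ordinary text content in a message:
content = [reasoning, ContentText(text="The answer is 42.")]
```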
inspect_ai/_util/interrupt.py
CHANGED
@@ -1,9 +1,15 @@
 import asyncio

+from .working import check_sample_working_limit
+

 def check_sample_interrupt() -> None:
     from inspect_ai.log._samples import sample_active

+    # check for user interrupt
     sample = sample_active()
     if sample and sample.interrupt_action:
         raise asyncio.CancelledError()
+
+    # check for working_limit
+    check_sample_working_limit()
inspect_ai/_util/logger.py
CHANGED
@@ -160,7 +160,9 @@ def init_logger(

     # init logging handler on demand
     global _logHandler
+    removed_root_handlers = False
     if not _logHandler:
+        removed_root_handlers = remove_non_pytest_root_logger_handlers()
         _logHandler = LogHandler(min(DEBUG, levelno), transcript_levelno)
         getLogger().addHandler(_logHandler)

@@ -173,6 +175,11 @@ def init_logger(
     getLogger("httpx").setLevel(capture_level)
     getLogger("botocore").setLevel(DEBUG)

+    if removed_root_handlers:
+        getLogger(PKG_NAME).warning(
+            "Inspect removed pre-existing root logger handlers and replaced them with its own handler."
+        )
+
     # set the levelno on the global handler
     _logHandler.display_level = levelno

@@ -180,6 +187,18 @@ def init_logger(
 _logHandler: LogHandler | None = None


+def remove_non_pytest_root_logger_handlers() -> bool:
+    root_logger = getLogger()
+    non_pytest_handlers = [
+        handler
+        for handler in root_logger.handlers
+        if handler.__module__ != "_pytest.logging"
+    ]
+    for handler in non_pytest_handlers:
+        root_logger.removeHandler(handler)
+    return len(non_pytest_handlers) > 0
+
+
 def notify_logger_record(record: LogRecord, write: bool) -> None:
     from inspect_ai.log._message import LoggingMessage
     from inspect_ai.log._transcript import LoggerEvent, transcript
inspect_ai/_util/rich.py
CHANGED
@@ -2,23 +2,22 @@ from rich.console import RenderableType
 from rich.style import Style
 from rich.text import Text

+from inspect_ai._util.text import truncate_lines
+

 def lines_display(
     text: str, max_lines: int = 100, style: str | Style = ""
 ) -> list[RenderableType]:
-    lines = text
-
-
-
-    ]
+    lines, truncated = truncate_lines(text, max_lines)
+
+    content: list[RenderableType] = [Text(lines, style=style)]
+    if truncated is not None:
         content.append(Text())
         content.append(
             Text.from_markup(
-                f"[italic]Output truncated ({
+                f"[italic]Output truncated ({truncated} additional lines)...[/italic]",
                 style=style,
             )
         )
-    else:
-        content = [Text(text, style=style)]

     return content
inspect_ai/_util/text.py
CHANGED
@@ -134,6 +134,19 @@ def truncate(text: str, length: int, overflow: str = "...", pad: bool = True) ->
     return truncated


+def truncate_lines(
+    text: str, max_lines: int = 100, max_characters: int | None = 100 * 100
+) -> tuple[str, int | None]:
+    if max_characters is not None:
+        text = truncate(text, max_characters)
+    lines = text.splitlines()
+    if len(lines) > max_lines:
+        output = "\n".join(lines[0:max_lines])
+        return output, len(lines) - max_lines
+    else:
+        return text, None
+
+
 def generate_large_text(target_tokens: int) -> str:
     """Generate a large amount of text with approximately the target number of tokens"""
     generated_text = []
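Usage of the new helper is straightforward; a quick sketch:

```python
from inspect_ai._util.text import truncate_lines

text = "\n".join(f"line {i}" for i in range(250))

output, truncated = truncate_lines(text, max_lines=100)
print(truncated)  # 150 -- the number of lines dropped

output, truncated = truncate_lines("just one line")
print(truncated)  # None -- nothing was dropped
```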
inspect_ai/_util/transcript.py
CHANGED
@@ -10,6 +10,8 @@ from rich.panel import Panel
 from rich.rule import Rule
 from rich.text import Text

+from inspect_ai._util.content import ContentReasoning
+
 from .format import format_function_call


@@ -111,19 +113,31 @@ def transcript_panel(
     )


-def transcript_reasoning(reasoning:
+def transcript_reasoning(reasoning: ContentReasoning) -> list[RenderableType]:
     content: list[RenderableType] = []
+    text = (
+        reasoning.reasoning
+        if not reasoning.redacted
+        else "Reasoning encrypted by model provider."
+    )
+
     content.append(
-        transcript_markdown(
-            f"**<think>** \n{reasoning} \n**</think>**\n\n", escape=True
-        )
+        transcript_markdown(f"**<think>** \n{text} \n**</think>**\n\n", escape=True)
     )
     content.append(Text())
     return content


-def transcript_separator(
-
+def transcript_separator(
+    title: str, color: str, characters: str = "─"
+) -> RenderableType:
+    return Rule(
+        title=title,
+        characters=characters,
+        style=f"{color} bold",
+        align="center",
+        end="\n\n",
+    )


 def transcript_function(function: str, arguments: dict[str, Any]) -> RenderableType:
inspect_ai/_util/working.py
ADDED
@@ -0,0 +1,50 @@
+import time
+from contextvars import ContextVar
+
+
+def init_sample_working_limit(start_time: float, working_limit: float | None) -> None:
+    _sample_working_limit.set(working_limit)
+    _sample_start_time.set(start_time)
+    _sample_waiting_time.set(0)
+
+
+def sample_waiting_time() -> float:
+    return _sample_waiting_time.get()
+
+
+def sample_working_time() -> float:
+    return time.monotonic() - _sample_start_time.get() - sample_waiting_time()
+
+
+def report_sample_waiting_time(waiting_time: float) -> None:
+    _sample_waiting_time.set(_sample_waiting_time.get() + waiting_time)
+    check_sample_working_limit()
+
+
+def check_sample_working_limit() -> None:
+    # no check if we don't have a limit
+    working_limit = _sample_working_limit.get()
+    if working_limit is None:
+        return
+
+    # are we over the limit?
+    running_time = time.monotonic() - _sample_start_time.get()
+    working_time = running_time - sample_waiting_time()
+    if working_time > working_limit:
+        from inspect_ai.solver._limit import SampleLimitExceededError
+
+        raise SampleLimitExceededError(
+            type="working",
+            value=int(working_time),
+            limit=int(working_limit),
+            message=f"Exceeded working time limit ({working_limit:,} seconds)",
+        )
+
+
+_sample_working_limit: ContextVar[float | None] = ContextVar(
+    "sample_working_limit", default=None
+)
+
+_sample_start_time: ContextVar[float] = ContextVar("sample_start_time", default=0)
+
+_sample_waiting_time: ContextVar[float] = ContextVar("sample_waiting_time", default=0)
inspect_ai/_view/www/App.css
CHANGED
@@ -805,15 +805,21 @@ table.table.table-sm td {
   overflow: unset;
 }

+.markdown-content pre[class*="language-"],
 pre[class*="language-"].tool-output,
 .tool-output {
   background-color: #f8f8f8;
 }
+
+.vscode-dark .model-call pre[class*="language-"],
+.vscode-dark .markdown-content pre[class*="language-"],
 .vscode-dark pre[class*="language-"].tool-output,
 .vscode-dark .tool-output {
   background-color: #333333;
 }

+.model-call pre[class*="language-"],
+.markdown-content pre[class*="language-"],
 pre[class*="language-"].tool-output {
   border: none !important;
   box-shadow: none !important;