inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +27 -9
- inspect_ai/_display/core/display.py +2 -0
- inspect_ai/_display/core/footer.py +13 -3
- inspect_ai/_display/plain/display.py +6 -2
- inspect_ai/_display/rich/display.py +19 -6
- inspect_ai/_display/textual/app.py +9 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +4 -10
- inspect_ai/_display/textual/widgets/transcript.py +35 -18
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +6 -1
- inspect_ai/_eval/run.py +6 -0
- inspect_ai/_eval/task/run.py +49 -23
- inspect_ai/_eval/task/task.py +26 -3
- inspect_ai/_util/content.py +20 -1
- inspect_ai/_util/interrupt.py +6 -0
- inspect_ai/_util/logger.py +19 -0
- inspect_ai/_util/rich.py +7 -8
- inspect_ai/_util/text.py +13 -0
- inspect_ai/_util/transcript.py +20 -6
- inspect_ai/_util/working.py +50 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +171 -99
- inspect_ai/_view/www/dist/assets/index.js +5972 -2770
- inspect_ai/_view/www/eslint.config.mjs +24 -1
- inspect_ai/_view/www/log-schema.json +619 -21
- inspect_ai/_view/www/package.json +8 -3
- inspect_ai/_view/www/src/App.tsx +2 -2
- inspect_ai/_view/www/src/appearance/icons.ts +3 -1
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
- inspect_ai/_view/www/src/components/Card.tsx +9 -8
- inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
- inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
- inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
- inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
- inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
- inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
- inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
- inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
- inspect_ai/_view/www/src/index.tsx +2 -2
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
- inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
- inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
- inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
- inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
- inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
- inspect_ai/_view/www/src/types/log.d.ts +312 -137
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
- inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
- inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
- inspect_ai/_view/www/src/utils/format.ts +8 -5
- inspect_ai/_view/www/src/utils/json.ts +24 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
- inspect_ai/_view/www/yarn.lock +241 -5
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_condense.py +4 -0
- inspect_ai/log/_log.py +72 -12
- inspect_ai/log/_recorders/eval.py +6 -1
- inspect_ai/log/_samples.py +5 -1
- inspect_ai/log/_transcript.py +89 -2
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +8 -1
- inspect_ai/model/_chat_message.py +22 -7
- inspect_ai/model/_conversation.py +11 -9
- inspect_ai/model/_generate_config.py +25 -4
- inspect_ai/model/_model.py +164 -72
- inspect_ai/model/_model_call.py +10 -3
- inspect_ai/model/_model_output.py +3 -0
- inspect_ai/model/_openai.py +106 -40
- inspect_ai/model/_providers/anthropic.py +145 -26
- inspect_ai/model/_providers/bedrock.py +7 -0
- inspect_ai/model/_providers/cloudflare.py +20 -7
- inspect_ai/model/_providers/google.py +29 -8
- inspect_ai/model/_providers/groq.py +66 -27
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +78 -51
- inspect_ai/model/_providers/openai.py +66 -4
- inspect_ai/model/_providers/openai_o1.py +10 -0
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/util/tracker.py +92 -0
- inspect_ai/model/_providers/vllm.py +13 -5
- inspect_ai/model/_reasoning.py +15 -2
- inspect_ai/scorer/_model.py +23 -19
- inspect_ai/solver/_basic_agent.py +1 -3
- inspect_ai/solver/_bridge/patch.py +0 -2
- inspect_ai/solver/_human_agent/agent.py +14 -10
- inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
- inspect_ai/solver/_human_agent/commands/submit.py +76 -30
- inspect_ai/solver/_limit.py +4 -4
- inspect_ai/solver/_plan.py +0 -3
- inspect_ai/solver/_task_state.py +7 -0
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +3 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
- inspect_ai/tool/_tools/_web_search.py +3 -3
- inspect_ai/util/__init__.py +2 -1
- inspect_ai/util/_concurrency.py +14 -8
- inspect_ai/util/_display.py +12 -0
- inspect_ai/util/_sandbox/context.py +15 -0
- inspect_ai/util/_sandbox/docker/docker.py +7 -5
- inspect_ai/util/_sandbox/environment.py +32 -1
- inspect_ai/util/_sandbox/events.py +183 -0
- inspect_ai/util/_sandbox/local.py +3 -3
- inspect_ai/util/_sandbox/self_check.py +131 -43
- inspect_ai/util/_subtask.py +11 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
- inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
- inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
- inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,8 @@
|
|
1
1
|
import asyncio
|
2
|
+
from typing import cast
|
2
3
|
|
3
4
|
from inspect_ai.util import display_type, input_panel, sandbox
|
5
|
+
from inspect_ai.util._sandbox.events import SandboxEnvironmentProxy
|
4
6
|
|
5
7
|
from .._solver import Generate, Solver, solver
|
6
8
|
from .._task_state import TaskState
|
@@ -56,19 +58,21 @@ def human_agent(
|
|
56
58
|
|
57
59
|
# helper function to run the agent (called for fullscreen vs. fallback below)
|
58
60
|
async def run_human_agent(view: HumanAgentView) -> TaskState:
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
61
|
+
sandbox_proxy = cast(SandboxEnvironmentProxy, sandbox())
|
62
|
+
with sandbox_proxy.no_events():
|
63
|
+
# create agent commands
|
64
|
+
commands = human_agent_commands(
|
65
|
+
state, answer, intermediate_scoring, record_session
|
66
|
+
)
|
63
67
|
|
64
|
-
|
65
|
-
|
68
|
+
# install agent tools
|
69
|
+
await install_human_agent(state, commands, record_session)
|
66
70
|
|
67
|
-
|
68
|
-
|
71
|
+
# hookup the view ui
|
72
|
+
view.connect(connection)
|
69
73
|
|
70
|
-
|
71
|
-
|
74
|
+
# run sandbox service
|
75
|
+
return await run_human_agent_service(state, commands, view)
|
72
76
|
|
73
77
|
# support both fullscreen ui and fallback
|
74
78
|
if display_type() == "full":
|
@@ -6,7 +6,7 @@ from .instructions import InstructionsCommand
|
|
6
6
|
from .note import NoteCommand
|
7
7
|
from .score import ScoreCommand
|
8
8
|
from .status import StatusCommand
|
9
|
-
from .submit import SubmitCommand, ValidateCommand
|
9
|
+
from .submit import QuitCommand, SubmitCommand, ValidateCommand
|
10
10
|
|
11
11
|
|
12
12
|
def human_agent_commands(
|
@@ -15,8 +15,12 @@ def human_agent_commands(
|
|
15
15
|
intermediate_scoring: bool,
|
16
16
|
record_session: bool,
|
17
17
|
) -> list[HumanAgentCommand]:
|
18
|
-
# base submit and
|
19
|
-
commands = [
|
18
|
+
# base submit, validate, and quit
|
19
|
+
commands = [
|
20
|
+
SubmitCommand(record_session),
|
21
|
+
ValidateCommand(answer),
|
22
|
+
QuitCommand(record_session),
|
23
|
+
]
|
20
24
|
|
21
25
|
# optional intermediate scoring
|
22
26
|
if intermediate_scoring:
|
@@ -16,22 +16,89 @@ from .command import HumanAgentCommand, call_human_agent
|
|
16
16
|
logger = getLogger(__name__)
|
17
17
|
|
18
18
|
|
19
|
-
class
|
19
|
+
class SessionEndCommand(HumanAgentCommand):
|
20
20
|
def __init__(self, record_session: bool):
|
21
21
|
super().__init__()
|
22
22
|
self._record_session = record_session
|
23
23
|
|
24
|
+
@property
|
25
|
+
def group(self) -> Literal[1, 2, 3]:
|
26
|
+
return 1
|
27
|
+
|
28
|
+
async def _read_session_logs(self) -> dict[str, str]:
|
29
|
+
# retrieve session logs (don't fail)
|
30
|
+
sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
|
31
|
+
result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
|
32
|
+
if not result.success:
|
33
|
+
logger.warning(f"Error listing human agent session logs: {result.stderr}")
|
34
|
+
return {}
|
35
|
+
|
36
|
+
# read logs
|
37
|
+
session_logs: dict[str, str] = {}
|
38
|
+
for session_log in result.stdout.strip().splitlines():
|
39
|
+
try:
|
40
|
+
session_logs[session_log] = await sandbox().read_file(
|
41
|
+
(sessions_dir / session_log).as_posix()
|
42
|
+
)
|
43
|
+
except Exception as ex:
|
44
|
+
logger.warning(f"Error reading human agent session log: {ex}")
|
45
|
+
|
46
|
+
return session_logs
|
47
|
+
|
48
|
+
|
49
|
+
class QuitCommand(SessionEndCommand):
|
24
50
|
@property
|
25
51
|
def name(self) -> str:
|
26
|
-
return "
|
52
|
+
return "quit"
|
27
53
|
|
28
54
|
@property
|
29
55
|
def description(self) -> str:
|
30
|
-
return "
|
56
|
+
return "Quit the task without submitting an answer."
|
57
|
+
|
58
|
+
def cli(self, args: Namespace) -> None:
|
59
|
+
# verify that the user wants to proceed
|
60
|
+
action = "quit the task without submitting an answer (ending the exercise)"
|
61
|
+
while True:
|
62
|
+
response = (
|
63
|
+
input(
|
64
|
+
f"\nDo you definitely want to {action}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
|
65
|
+
)
|
66
|
+
.lower()
|
67
|
+
.strip()
|
68
|
+
)
|
69
|
+
if response in ["yes", "y"]:
|
70
|
+
break
|
71
|
+
elif response in ["no", "n"]:
|
72
|
+
return
|
73
|
+
else:
|
74
|
+
print("Please enter yes or no.")
|
31
75
|
|
76
|
+
# thank the user!
|
77
|
+
print(
|
78
|
+
"\nThank you for working on this task!\n\n"
|
79
|
+
+ "Your task will now be scored and you will be disconnected from this container.\n"
|
80
|
+
)
|
81
|
+
|
82
|
+
call_human_agent("quit")
|
83
|
+
|
84
|
+
def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
|
85
|
+
async def submit() -> None:
|
86
|
+
if self._record_session:
|
87
|
+
state.logs = await self._read_session_logs()
|
88
|
+
state.running = False
|
89
|
+
state.answer = ""
|
90
|
+
|
91
|
+
return submit
|
92
|
+
|
93
|
+
|
94
|
+
class SubmitCommand(SessionEndCommand):
|
32
95
|
@property
|
33
|
-
def
|
34
|
-
return
|
96
|
+
def name(self) -> str:
|
97
|
+
return "submit"
|
98
|
+
|
99
|
+
@property
|
100
|
+
def description(self) -> str:
|
101
|
+
return "Submit your final answer for the task."
|
35
102
|
|
36
103
|
@property
|
37
104
|
def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
|
@@ -55,10 +122,12 @@ class SubmitCommand(HumanAgentCommand):
|
|
55
122
|
# verify that the user wants to proceed
|
56
123
|
answer = call_args.get("answer", None)
|
57
124
|
answer_text = f" '{answer}'" if answer else ""
|
125
|
+
action = f"end the task and submit{answer_text}"
|
126
|
+
|
58
127
|
while True:
|
59
128
|
response = (
|
60
129
|
input(
|
61
|
-
f"\nDo you definitely want to
|
130
|
+
f"\nDo you definitely want to {action}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
|
62
131
|
)
|
63
132
|
.lower()
|
64
133
|
.strip()
|
@@ -76,13 +145,10 @@ class SubmitCommand(HumanAgentCommand):
|
|
76
145
|
+ "Your task will now be scored and you will be disconnected from this container.\n"
|
77
146
|
)
|
78
147
|
|
79
|
-
# submit the task
|
80
148
|
call_human_agent("submit", **call_args)
|
81
149
|
|
82
150
|
def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
|
83
|
-
async def submit(
|
84
|
-
answer: str | None, session_logs: dict[str, str] | None = None
|
85
|
-
) -> None:
|
151
|
+
async def submit(answer: str) -> None:
|
86
152
|
if self._record_session:
|
87
153
|
state.logs = await self._read_session_logs()
|
88
154
|
state.running = False
|
@@ -90,26 +156,6 @@ class SubmitCommand(HumanAgentCommand):
|
|
90
156
|
|
91
157
|
return submit
|
92
158
|
|
93
|
-
async def _read_session_logs(self) -> dict[str, str]:
|
94
|
-
# retrieve session logs (don't fail)
|
95
|
-
sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
|
96
|
-
result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
|
97
|
-
if not result.success:
|
98
|
-
logger.warning(f"Error listing human agent session logs: {result.stderr}")
|
99
|
-
return {}
|
100
|
-
|
101
|
-
# read logs
|
102
|
-
session_logs: dict[str, str] = {}
|
103
|
-
for session_log in result.stdout.strip().splitlines():
|
104
|
-
try:
|
105
|
-
session_logs[session_log] = await sandbox().read_file(
|
106
|
-
(sessions_dir / session_log).as_posix()
|
107
|
-
)
|
108
|
-
except Exception as ex:
|
109
|
-
logger.warning(f"Error reading human agent session log: {ex}")
|
110
|
-
|
111
|
-
return session_logs
|
112
|
-
|
113
159
|
|
114
160
|
class ValidateCommand(HumanAgentCommand):
|
115
161
|
def __init__(self, answer: bool | str) -> None:
|
inspect_ai/solver/_limit.py
CHANGED
@@ -7,15 +7,15 @@ class SampleLimitExceededError(Exception):
|
|
7
7
|
"""Exception raised when a sample limit is exceeded.
|
8
8
|
|
9
9
|
Args:
|
10
|
-
type
|
11
|
-
value
|
12
|
-
limit
|
10
|
+
type: Type of limit exceeded.
|
11
|
+
value: Value compared to limit.
|
12
|
+
limit: Limit applied.
|
13
13
|
message (str | None): Optional. Human readable message.
|
14
14
|
"""
|
15
15
|
|
16
16
|
def __init__(
|
17
17
|
self,
|
18
|
-
type: Literal["message", "time", "token", "operator", "custom"],
|
18
|
+
type: Literal["message", "time", "working", "token", "operator", "custom"],
|
19
19
|
*,
|
20
20
|
value: int,
|
21
21
|
limit: int,
|
inspect_ai/solver/_plan.py
CHANGED
inspect_ai/solver/_task_state.py
CHANGED
@@ -7,6 +7,7 @@ from random import Random
|
|
7
7
|
from typing import Any, Iterable, SupportsIndex, Type, Union, cast, overload
|
8
8
|
|
9
9
|
from pydantic_core import to_jsonable_python
|
10
|
+
from shortuuid import uuid
|
10
11
|
|
11
12
|
from inspect_ai._util.interrupt import check_sample_interrupt
|
12
13
|
from inspect_ai.dataset._dataset import MT, Sample, metadata_as
|
@@ -165,6 +166,7 @@ class TaskState:
|
|
165
166
|
self._token_limit = token_limit
|
166
167
|
self._completed = completed
|
167
168
|
self._store = Store()
|
169
|
+
self._uuid = uuid()
|
168
170
|
|
169
171
|
if choices:
|
170
172
|
self.choices = Choices(choices)
|
@@ -373,6 +375,11 @@ class TaskState:
|
|
373
375
|
scores: dict[str, Score] | None = None
|
374
376
|
"""Scores yielded by running task."""
|
375
377
|
|
378
|
+
@property
|
379
|
+
def uuid(self) -> str:
|
380
|
+
"""Globally unique identifier for sample run."""
|
381
|
+
return self._uuid
|
382
|
+
|
376
383
|
def metadata_as(self, metadata_cls: Type[MT]) -> MT:
|
377
384
|
"""Pydantic model interface to metadata.
|
378
385
|
|
inspect_ai/tool/__init__.py
CHANGED
@@ -2,6 +2,7 @@ from inspect_ai._util.content import (
|
|
2
2
|
Content,
|
3
3
|
ContentAudio,
|
4
4
|
ContentImage,
|
5
|
+
ContentReasoning,
|
5
6
|
ContentText,
|
6
7
|
ContentVideo,
|
7
8
|
)
|
@@ -41,6 +42,7 @@ __all__ = [
|
|
41
42
|
"Content",
|
42
43
|
"ContentAudio",
|
43
44
|
"ContentImage",
|
45
|
+
"ContentReasoning",
|
44
46
|
"ContentText",
|
45
47
|
"ContentVideo",
|
46
48
|
"ToolCall",
|
inspect_ai/tool/_tool.py
CHANGED
@@ -14,6 +14,7 @@ from typing import (
|
|
14
14
|
from inspect_ai._util.content import (
|
15
15
|
ContentAudio,
|
16
16
|
ContentImage,
|
17
|
+
ContentReasoning,
|
17
18
|
ContentText,
|
18
19
|
ContentVideo,
|
19
20
|
)
|
@@ -35,10 +36,11 @@ ToolResult = (
|
|
35
36
|
| float
|
36
37
|
| bool
|
37
38
|
| ContentText
|
39
|
+
| ContentReasoning
|
38
40
|
| ContentImage
|
39
41
|
| ContentAudio
|
40
42
|
| ContentVideo
|
41
|
-
| list[ContentText | ContentImage | ContentAudio | ContentVideo]
|
43
|
+
| list[ContentText | ContentReasoning | ContentImage | ContentAudio | ContentVideo]
|
42
44
|
)
|
43
45
|
"""Valid types for results from tool calls."""
|
44
46
|
|
@@ -32,7 +32,7 @@ async def run(
|
|
32
32
|
maybe_truncate(stdout.decode(), truncate_after=truncate_after),
|
33
33
|
maybe_truncate(stderr.decode(), truncate_after=truncate_after),
|
34
34
|
)
|
35
|
-
except asyncio.TimeoutError as exc:
|
35
|
+
except (TimeoutError, asyncio.TimeoutError) as exc:
|
36
36
|
try:
|
37
37
|
process.kill()
|
38
38
|
except ProcessLookupError:
|
@@ -0,0 +1,24 @@
|
|
1
|
+
{
|
2
|
+
"version": "0.2.0",
|
3
|
+
"configurations": [
|
4
|
+
{
|
5
|
+
"type": "debugpy",
|
6
|
+
"request": "launch",
|
7
|
+
"name": "Debug Web Server",
|
8
|
+
"program": "${workspaceFolder}/web_server.py"
|
9
|
+
},
|
10
|
+
{
|
11
|
+
"type": "debugpy",
|
12
|
+
"request": "launch",
|
13
|
+
"name": "Debug Web Client interactive mode",
|
14
|
+
"program": "${workspaceFolder}/web_client.py"
|
15
|
+
},
|
16
|
+
{
|
17
|
+
"type": "debugpy",
|
18
|
+
"request": "launch",
|
19
|
+
"name": "Debug Web Client w/arguments",
|
20
|
+
"program": "${workspaceFolder}/web_client.py",
|
21
|
+
"args": ["${command:pickArgs}"]
|
22
|
+
}
|
23
|
+
]
|
24
|
+
}
|
@@ -0,0 +1,25 @@
|
|
1
|
+
{
|
2
|
+
"cSpell.words": [
|
3
|
+
"activedescendant",
|
4
|
+
"describedby",
|
5
|
+
"domcontentloaded",
|
6
|
+
"figcaption",
|
7
|
+
"flowto",
|
8
|
+
"framenavigated",
|
9
|
+
"headful",
|
10
|
+
"idref",
|
11
|
+
"jsonrpcclient",
|
12
|
+
"jsonrpcserver",
|
13
|
+
"keepalive",
|
14
|
+
"keyshortcuts",
|
15
|
+
"labelfor",
|
16
|
+
"labelledby",
|
17
|
+
"labelwrapped",
|
18
|
+
"multiselectable",
|
19
|
+
"Rects",
|
20
|
+
"roledescription",
|
21
|
+
"rubyannotation",
|
22
|
+
"tablecaption",
|
23
|
+
"valuetext"
|
24
|
+
]
|
25
|
+
}
|
@@ -8,16 +8,15 @@ RUN apt-get update
|
|
8
8
|
|
9
9
|
RUN pip install --upgrade pip
|
10
10
|
|
11
|
+
RUN pip install playwright jsonrpcclient jsonrpcserver httpx aiohttp pillow pydantic tenacity
|
12
|
+
|
11
13
|
# Install playwright
|
12
|
-
RUN pip install playwright
|
13
14
|
RUN playwright install
|
14
15
|
RUN playwright install-deps
|
15
16
|
|
16
|
-
# Install other dependancies
|
17
|
-
RUN pip install dm-env-rpc pillow bs4 lxml
|
18
|
-
|
19
17
|
# Copy Python files alongside the Dockerfile
|
20
|
-
COPY
|
18
|
+
COPY . .
|
21
19
|
|
22
20
|
# Run the server
|
23
|
-
CMD ["python3", "/app/web_browser/web_server.py"]
|
21
|
+
CMD ["python3", "/app/web_browser/web_server.py"]
|
22
|
+
# CMD ["tail", "-f", "/dev/null"]
|
@@ -1,7 +1,6 @@
|
|
1
1
|
## Headless Browser Tool
|
2
2
|
|
3
|
-
This directory contains an implementation for the Headless Browser Tool which can be used to test web browsing agents.
|
4
|
-
|
3
|
+
This directory contains an implementation for the Headless Browser Tool which can be used to test web browsing agents.
|
5
4
|
|
6
5
|
### Usage
|
7
6
|
|
@@ -37,27 +36,27 @@ The result will be printed out in _stdout_ in the following format:
|
|
37
36
|
|
38
37
|
```
|
39
38
|
# Inside the Docker container
|
40
|
-
error: <an ERROR message if one
|
39
|
+
error: <an ERROR message if one occurred>
|
41
40
|
info: <general info about the container>
|
42
41
|
web_url: <the URL of the page the browser is currently at>
|
43
42
|
web_at: <accessibility tree of the visible elements of the page>
|
44
|
-
```
|
45
|
-
|
43
|
+
```
|
46
44
|
|
47
45
|
### Design
|
48
46
|
|
49
47
|
The following diagram describes the design and the intended usage of the tool:
|
50
48
|
|
51
|
-

|
52
50
|
|
53
51
|
The tool consists of the following components:
|
54
52
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
53
|
+
- [WebServer](web_server.py) - a server which launches a stateful session with the headless chromium browser and interacts with it through the [Playwright API](https://playwright.dev/python/docs/intro) upon receiving client commands. The server components are:
|
54
|
+
|
55
|
+
- _dm_env_servicer.py_ - an implementation for the gRPC Service based on [dm_env_rpc protocol](https://github.com/google-deepmind/dm_env_rpc).
|
56
|
+
- _web_environment.py_ - an environment which gets instantiated by the servicer and which launches the browser, stores its state and maps client commands to Playwright API.
|
57
|
+
- _playwright_crawler.py_ - a wrapper over the sync Playwright API.
|
59
58
|
|
60
|
-
|
59
|
+
- [WebClient](web_client.py) - a simple stateless client to interact with the server. When launched, the client:
|
61
60
|
1. creates a connection with the server;
|
62
61
|
2. sends user command to the server;
|
63
62
|
3. receives the response in the form of observations and prints them to stdout;
|
@@ -0,0 +1,71 @@
|
|
1
|
+
from functools import reduce
|
2
|
+
from typing import Iterable, TypedDict
|
3
|
+
|
4
|
+
from accessibility_tree_node import AccessibilityTreeNode
|
5
|
+
from cdp.a11y import AXNode, AXNodeId
|
6
|
+
from cdp.dom_snapshot import DOMSnapshot, create_snapshot_context
|
7
|
+
from rectangle import Rectangle
|
8
|
+
|
9
|
+
_AccType = tuple[
|
10
|
+
AXNode | None,
|
11
|
+
dict[AXNodeId, AXNode],
|
12
|
+
]
|
13
|
+
|
14
|
+
|
15
|
+
class AccessibilityTree(TypedDict):
|
16
|
+
root: AccessibilityTreeNode
|
17
|
+
nodes: dict[AXNodeId, AccessibilityTreeNode]
|
18
|
+
|
19
|
+
|
20
|
+
def create_accessibility_tree(
|
21
|
+
*,
|
22
|
+
ax_nodes: Iterable[AXNode],
|
23
|
+
dom_snapshot: DOMSnapshot,
|
24
|
+
device_scale_factor: float,
|
25
|
+
window_bounds: Rectangle,
|
26
|
+
) -> AccessibilityTree | None:
|
27
|
+
"""
|
28
|
+
Creates an accessibility tree from the given Chrome DevTools Protocol AX nodes and DOM snapshot.
|
29
|
+
|
30
|
+
Args:
|
31
|
+
ax_nodes (Iterable[AXNode]): An iterable of AXNode objects representing the accessibility nodes.
|
32
|
+
dom_snapshot (DOMSnapshot): A snapshot of the DOM at the time of accessibility tree creation.
|
33
|
+
device_scale_factor (float): The scale factor of the device.
|
34
|
+
window_bounds (Rectangle): The bounds of the window.
|
35
|
+
|
36
|
+
Returns:
|
37
|
+
AccessibilityTree | None: The accessibility tree, or None if no root node (an AXNode whose parentId is None) is found.
|
38
|
+
"""
|
39
|
+
|
40
|
+
# first make a dict of AXNodeId's to AXNode's and find the root on the way
|
41
|
+
def reducer(acc: _AccType, ax_node: AXNode) -> _AccType:
|
42
|
+
root_node, nodes = acc
|
43
|
+
nodes[ax_node.nodeId] = ax_node
|
44
|
+
return (
|
45
|
+
# TODO: What do we want for multiple roots?
|
46
|
+
root_node or (ax_node if ax_node.parentId is None else None),
|
47
|
+
nodes,
|
48
|
+
)
|
49
|
+
|
50
|
+
initial_acc: _AccType = (None, {})  # explicit annotation: the type checker cannot infer _AccType from this literal
|
51
|
+
root_node, nodes = reduce(reducer, ax_nodes, initial_acc)
|
52
|
+
|
53
|
+
if not root_node:
|
54
|
+
return None
|
55
|
+
|
56
|
+
# Now create the AccessibilityTreeNode hierarchy
|
57
|
+
snapshot_context = create_snapshot_context(dom_snapshot)
|
58
|
+
all_accessibility_tree_nodes: dict[AXNodeId, AccessibilityTreeNode] = {}
|
59
|
+
|
60
|
+
return AccessibilityTree(
|
61
|
+
root=AccessibilityTreeNode(
|
62
|
+
ax_node=root_node,
|
63
|
+
ax_nodes=nodes,
|
64
|
+
parent=None,
|
65
|
+
all_accessibility_tree_nodes=all_accessibility_tree_nodes,
|
66
|
+
snapshot_context=snapshot_context,
|
67
|
+
device_scale_factor=device_scale_factor,
|
68
|
+
window_bounds=window_bounds,
|
69
|
+
),
|
70
|
+
nodes=all_accessibility_tree_nodes,
|
71
|
+
)
|