hud-python 0.3.5__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +22 -89
- hud/agents/__init__.py +15 -0
- hud/agents/art.py +101 -0
- hud/agents/base.py +599 -0
- hud/{mcp → agents}/claude.py +373 -321
- hud/{mcp → agents}/langchain.py +250 -250
- hud/agents/misc/__init__.py +7 -0
- hud/{agent → agents}/misc/response_agent.py +80 -80
- hud/{mcp → agents}/openai.py +352 -334
- hud/agents/openai_chat_generic.py +154 -0
- hud/{mcp → agents}/tests/__init__.py +1 -1
- hud/agents/tests/test_base.py +742 -0
- hud/agents/tests/test_claude.py +324 -0
- hud/{mcp → agents}/tests/test_client.py +363 -324
- hud/{mcp → agents}/tests/test_openai.py +237 -238
- hud/cli/__init__.py +617 -0
- hud/cli/__main__.py +8 -0
- hud/cli/analyze.py +371 -0
- hud/cli/analyze_metadata.py +230 -0
- hud/cli/build.py +427 -0
- hud/cli/clone.py +185 -0
- hud/cli/cursor.py +92 -0
- hud/cli/debug.py +392 -0
- hud/cli/docker_utils.py +83 -0
- hud/cli/init.py +281 -0
- hud/cli/interactive.py +353 -0
- hud/cli/mcp_server.py +756 -0
- hud/cli/pull.py +336 -0
- hud/cli/push.py +370 -0
- hud/cli/remote_runner.py +311 -0
- hud/cli/runner.py +160 -0
- hud/cli/tests/__init__.py +3 -0
- hud/cli/tests/test_analyze.py +284 -0
- hud/cli/tests/test_cli_init.py +265 -0
- hud/cli/tests/test_cli_main.py +27 -0
- hud/cli/tests/test_clone.py +142 -0
- hud/cli/tests/test_cursor.py +253 -0
- hud/cli/tests/test_debug.py +453 -0
- hud/cli/tests/test_mcp_server.py +139 -0
- hud/cli/tests/test_utils.py +388 -0
- hud/cli/utils.py +263 -0
- hud/clients/README.md +143 -0
- hud/clients/__init__.py +16 -0
- hud/clients/base.py +379 -0
- hud/clients/fastmcp.py +222 -0
- hud/clients/mcp_use.py +278 -0
- hud/clients/tests/__init__.py +1 -0
- hud/clients/tests/test_client_integration.py +111 -0
- hud/clients/tests/test_fastmcp.py +342 -0
- hud/clients/tests/test_protocol.py +188 -0
- hud/clients/utils/__init__.py +1 -0
- hud/clients/utils/retry_transport.py +160 -0
- hud/datasets.py +322 -192
- hud/misc/__init__.py +1 -0
- hud/{agent → misc}/claude_plays_pokemon.py +292 -283
- hud/otel/__init__.py +35 -0
- hud/otel/collector.py +142 -0
- hud/otel/config.py +164 -0
- hud/otel/context.py +536 -0
- hud/otel/exporters.py +366 -0
- hud/otel/instrumentation.py +97 -0
- hud/otel/processors.py +118 -0
- hud/otel/tests/__init__.py +1 -0
- hud/otel/tests/test_processors.py +197 -0
- hud/server/__init__.py +5 -5
- hud/server/context.py +114 -0
- hud/server/helper/__init__.py +5 -0
- hud/server/low_level.py +132 -0
- hud/server/server.py +166 -0
- hud/server/tests/__init__.py +3 -0
- hud/settings.py +73 -79
- hud/shared/__init__.py +5 -0
- hud/{exceptions.py → shared/exceptions.py} +180 -180
- hud/{server → shared}/requests.py +264 -264
- hud/shared/tests/test_exceptions.py +157 -0
- hud/{server → shared}/tests/test_requests.py +275 -275
- hud/telemetry/__init__.py +25 -30
- hud/telemetry/instrument.py +379 -0
- hud/telemetry/job.py +309 -141
- hud/telemetry/replay.py +74 -0
- hud/telemetry/trace.py +83 -0
- hud/tools/__init__.py +33 -34
- hud/tools/base.py +365 -65
- hud/tools/bash.py +161 -137
- hud/tools/computer/__init__.py +15 -13
- hud/tools/computer/anthropic.py +437 -420
- hud/tools/computer/hud.py +376 -334
- hud/tools/computer/openai.py +295 -292
- hud/tools/computer/settings.py +82 -0
- hud/tools/edit.py +314 -290
- hud/tools/executors/__init__.py +30 -30
- hud/tools/executors/base.py +539 -532
- hud/tools/executors/pyautogui.py +621 -619
- hud/tools/executors/tests/__init__.py +1 -1
- hud/tools/executors/tests/test_base_executor.py +338 -338
- hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
- hud/tools/executors/xdo.py +511 -503
- hud/tools/{playwright_tool.py → playwright.py} +412 -379
- hud/tools/tests/__init__.py +3 -3
- hud/tools/tests/test_base.py +282 -0
- hud/tools/tests/test_bash.py +158 -152
- hud/tools/tests/test_bash_extended.py +197 -0
- hud/tools/tests/test_computer.py +425 -52
- hud/tools/tests/test_computer_actions.py +34 -34
- hud/tools/tests/test_edit.py +259 -240
- hud/tools/tests/test_init.py +27 -27
- hud/tools/tests/test_playwright_tool.py +183 -183
- hud/tools/tests/test_tools.py +145 -157
- hud/tools/tests/test_utils.py +156 -156
- hud/tools/types.py +72 -0
- hud/tools/utils.py +50 -50
- hud/types.py +136 -89
- hud/utils/__init__.py +10 -16
- hud/utils/async_utils.py +65 -0
- hud/utils/design.py +168 -0
- hud/utils/mcp.py +55 -0
- hud/utils/progress.py +149 -149
- hud/utils/telemetry.py +66 -66
- hud/utils/tests/test_async_utils.py +173 -0
- hud/utils/tests/test_init.py +17 -21
- hud/utils/tests/test_progress.py +261 -225
- hud/utils/tests/test_telemetry.py +82 -37
- hud/utils/tests/test_version.py +8 -8
- hud/version.py +7 -7
- hud_python-0.4.1.dist-info/METADATA +476 -0
- hud_python-0.4.1.dist-info/RECORD +132 -0
- hud_python-0.4.1.dist-info/entry_points.txt +3 -0
- {hud_python-0.3.5.dist-info → hud_python-0.4.1.dist-info}/licenses/LICENSE +21 -21
- hud/adapters/__init__.py +0 -8
- hud/adapters/claude/__init__.py +0 -5
- hud/adapters/claude/adapter.py +0 -180
- hud/adapters/claude/tests/__init__.py +0 -1
- hud/adapters/claude/tests/test_adapter.py +0 -519
- hud/adapters/common/__init__.py +0 -6
- hud/adapters/common/adapter.py +0 -178
- hud/adapters/common/tests/test_adapter.py +0 -289
- hud/adapters/common/types.py +0 -446
- hud/adapters/operator/__init__.py +0 -5
- hud/adapters/operator/adapter.py +0 -108
- hud/adapters/operator/tests/__init__.py +0 -1
- hud/adapters/operator/tests/test_adapter.py +0 -370
- hud/agent/__init__.py +0 -19
- hud/agent/base.py +0 -126
- hud/agent/claude.py +0 -271
- hud/agent/langchain.py +0 -215
- hud/agent/misc/__init__.py +0 -3
- hud/agent/operator.py +0 -268
- hud/agent/tests/__init__.py +0 -1
- hud/agent/tests/test_base.py +0 -202
- hud/env/__init__.py +0 -11
- hud/env/client.py +0 -35
- hud/env/docker_client.py +0 -349
- hud/env/environment.py +0 -446
- hud/env/local_docker_client.py +0 -358
- hud/env/remote_client.py +0 -212
- hud/env/remote_docker_client.py +0 -292
- hud/gym.py +0 -130
- hud/job.py +0 -773
- hud/mcp/__init__.py +0 -17
- hud/mcp/base.py +0 -631
- hud/mcp/client.py +0 -312
- hud/mcp/tests/test_base.py +0 -512
- hud/mcp/tests/test_claude.py +0 -294
- hud/task.py +0 -149
- hud/taskset.py +0 -237
- hud/telemetry/_trace.py +0 -347
- hud/telemetry/context.py +0 -230
- hud/telemetry/exporter.py +0 -575
- hud/telemetry/instrumentation/__init__.py +0 -3
- hud/telemetry/instrumentation/mcp.py +0 -259
- hud/telemetry/instrumentation/registry.py +0 -59
- hud/telemetry/mcp_models.py +0 -270
- hud/telemetry/tests/__init__.py +0 -1
- hud/telemetry/tests/test_context.py +0 -210
- hud/telemetry/tests/test_trace.py +0 -312
- hud/tools/helper/README.md +0 -56
- hud/tools/helper/__init__.py +0 -9
- hud/tools/helper/mcp_server.py +0 -78
- hud/tools/helper/server_initialization.py +0 -115
- hud/tools/helper/utils.py +0 -58
- hud/trajectory.py +0 -94
- hud/utils/agent.py +0 -37
- hud/utils/common.py +0 -256
- hud/utils/config.py +0 -120
- hud/utils/deprecation.py +0 -115
- hud/utils/misc.py +0 -53
- hud/utils/tests/test_common.py +0 -277
- hud/utils/tests/test_config.py +0 -129
- hud_python-0.3.5.dist-info/METADATA +0 -284
- hud_python-0.3.5.dist-info/RECORD +0 -120
- /hud/{adapters/common → shared}/tests/__init__.py +0 -0
- {hud_python-0.3.5.dist-info → hud_python-0.4.1.dist-info}/WHEEL +0 -0
hud/agent/claude.py
DELETED
|
@@ -1,271 +0,0 @@
|
|
|
1
|
-
import copy
|
|
2
|
-
import logging
|
|
3
|
-
from typing import Any, cast
|
|
4
|
-
|
|
5
|
-
from anthropic import AsyncAnthropic, BadRequestError
|
|
6
|
-
from anthropic.types.beta import (
|
|
7
|
-
BetaMessageParam,
|
|
8
|
-
BetaToolResultBlockParam,
|
|
9
|
-
BetaToolComputerUse20250124Param,
|
|
10
|
-
BetaTextBlockParam,
|
|
11
|
-
BetaImageBlockParam,
|
|
12
|
-
BetaCacheControlEphemeralParam,
|
|
13
|
-
)
|
|
14
|
-
|
|
15
|
-
from hud.adapters import Adapter
|
|
16
|
-
from hud.agent.base import Agent
|
|
17
|
-
from hud.adapters.claude import ClaudeAdapter
|
|
18
|
-
from hud.types import Gym
|
|
19
|
-
from hud.utils.common import Observation
|
|
20
|
-
from hud.settings import settings
|
|
21
|
-
from hud.adapters.common.types import LogType
|
|
22
|
-
|
|
23
|
-
logger = logging.getLogger(__name__)
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def base64_to_content_block(base64: str) -> BetaImageBlockParam:
|
|
27
|
-
return {
|
|
28
|
-
"type": "image",
|
|
29
|
-
"source": {"type": "base64", "media_type": "image/png", "data": base64},
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def text_to_content_block(text: str) -> BetaTextBlockParam:
|
|
34
|
-
return {"type": "text", "text": text}
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def tool_use_content_block(
|
|
38
|
-
tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]
|
|
39
|
-
) -> BetaToolResultBlockParam:
|
|
40
|
-
return {"type": "tool_result", "tool_use_id": tool_use_id, "content": content}
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
# Claude's Computer Use Tool definition
|
|
44
|
-
COMPUTER_TOOL: BetaToolComputerUse20250124Param = {
|
|
45
|
-
"type": "computer_20250124",
|
|
46
|
-
"name": "computer",
|
|
47
|
-
"display_width_px": 1024,
|
|
48
|
-
"display_height_px": 768,
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
53
|
-
"""
|
|
54
|
-
An agent implementation using Anthropic's Claude API with Computer Use.
|
|
55
|
-
|
|
56
|
-
This agent interacts with HUD environments using Claude's Computer Use API
|
|
57
|
-
through the ClaudeAdapter which converts actions to the format expected by HUD.
|
|
58
|
-
"""
|
|
59
|
-
|
|
60
|
-
transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}
|
|
61
|
-
|
|
62
|
-
def __init__(
|
|
63
|
-
self,
|
|
64
|
-
client: AsyncAnthropic | None = None,
|
|
65
|
-
adapter: Adapter | None = None,
|
|
66
|
-
model: str = "claude-3-7-sonnet-20250219",
|
|
67
|
-
max_tokens: int = 4096,
|
|
68
|
-
max_iterations: int = 10,
|
|
69
|
-
name: str | None = None,
|
|
70
|
-
):
|
|
71
|
-
"""
|
|
72
|
-
Initialize the ClaudeAgent.
|
|
73
|
-
|
|
74
|
-
Args:
|
|
75
|
-
client: The AsyncAnthropic client for API calls (optional, created automatically if not provided)
|
|
76
|
-
adapter: The adapter to use for preprocessing and postprocessing
|
|
77
|
-
model: The Claude model to use
|
|
78
|
-
max_tokens: Maximum tokens for Claude's response
|
|
79
|
-
max_iterations: Maximum number of iterations for the agent
|
|
80
|
-
name: The name of the agent
|
|
81
|
-
"""
|
|
82
|
-
# Initialize client if not provided
|
|
83
|
-
if client is None:
|
|
84
|
-
# Get API key from settings
|
|
85
|
-
api_key = settings.anthropic_api_key
|
|
86
|
-
if not api_key:
|
|
87
|
-
raise ValueError(
|
|
88
|
-
"Anthropic API key not found in settings or environment variables. Set ANTHROPIC_API_KEY."
|
|
89
|
-
)
|
|
90
|
-
|
|
91
|
-
# Create client
|
|
92
|
-
client = AsyncAnthropic(api_key=api_key)
|
|
93
|
-
|
|
94
|
-
adapter = adapter or ClaudeAdapter()
|
|
95
|
-
|
|
96
|
-
if name is None:
|
|
97
|
-
name = model
|
|
98
|
-
|
|
99
|
-
super().__init__(client=client, adapter=adapter, name=name)
|
|
100
|
-
|
|
101
|
-
self.model = model
|
|
102
|
-
self.max_tokens = max_tokens
|
|
103
|
-
self.max_iterations = max_iterations
|
|
104
|
-
|
|
105
|
-
# Default dimensions - will be updated if adapter is provided
|
|
106
|
-
self.width_px = 1024
|
|
107
|
-
self.height_px = 768
|
|
108
|
-
|
|
109
|
-
# Update dimensions if adapter is provided
|
|
110
|
-
if self.adapter:
|
|
111
|
-
self.width_px = self.adapter.agent_width
|
|
112
|
-
self.height_px = self.adapter.agent_height
|
|
113
|
-
|
|
114
|
-
# Message history
|
|
115
|
-
self.messages: list[BetaMessageParam] = []
|
|
116
|
-
self.pending_computer_use_tool_id = None
|
|
117
|
-
|
|
118
|
-
async def fetch_response(self, observation: Observation) -> tuple[list[Any], bool]:
|
|
119
|
-
"""
|
|
120
|
-
Fetch a response from Claude based on the observation.
|
|
121
|
-
|
|
122
|
-
Args:
|
|
123
|
-
observation: The preprocessed observation
|
|
124
|
-
|
|
125
|
-
Returns:
|
|
126
|
-
tuple[list[Any], bool, list[str | dict[str, Any]] | None]: A tuple containing the list of raw actions,
|
|
127
|
-
boolean indicating if the agent believes the task is complete, and a list of strings or dictionaries of logs.
|
|
128
|
-
"""
|
|
129
|
-
if not self.client:
|
|
130
|
-
raise ValueError("Client is required")
|
|
131
|
-
|
|
132
|
-
if not observation.text and not observation.screenshot:
|
|
133
|
-
raise ValueError("Observation must contain either text or screenshot")
|
|
134
|
-
|
|
135
|
-
# Prepare the user content for Claude
|
|
136
|
-
user_content: list[BetaImageBlockParam | BetaTextBlockParam | BetaToolResultBlockParam] = []
|
|
137
|
-
|
|
138
|
-
# Add text instruction if present
|
|
139
|
-
if observation.text:
|
|
140
|
-
# logger.info("Adding text to user content: %s", observation.text)
|
|
141
|
-
user_content.append(text_to_content_block(str(observation.text)))
|
|
142
|
-
|
|
143
|
-
# Add screenshot if present
|
|
144
|
-
if observation.screenshot:
|
|
145
|
-
# logger.info("Adding screenshot to user content")
|
|
146
|
-
if not self.pending_computer_use_tool_id:
|
|
147
|
-
# logger.info("Adding screenshot to user content, no tool id")
|
|
148
|
-
user_content.append(base64_to_content_block(observation.screenshot))
|
|
149
|
-
else:
|
|
150
|
-
# logger.info(
|
|
151
|
-
# "Adding screenshot to user content, tool id: %s",
|
|
152
|
-
# self.pending_computer_use_tool_id,
|
|
153
|
-
# )
|
|
154
|
-
user_content.append(
|
|
155
|
-
tool_use_content_block(
|
|
156
|
-
self.pending_computer_use_tool_id,
|
|
157
|
-
[base64_to_content_block(observation.screenshot)],
|
|
158
|
-
)
|
|
159
|
-
)
|
|
160
|
-
self.pending_computer_use_tool_id = None
|
|
161
|
-
|
|
162
|
-
# Add the user content to the messages
|
|
163
|
-
self.messages.append(
|
|
164
|
-
cast(
|
|
165
|
-
BetaMessageParam,
|
|
166
|
-
{
|
|
167
|
-
"role": "user",
|
|
168
|
-
"content": user_content,
|
|
169
|
-
},
|
|
170
|
-
)
|
|
171
|
-
)
|
|
172
|
-
|
|
173
|
-
# Call Claude API using async client, truncating 50 messages at a time if needed
|
|
174
|
-
while True:
|
|
175
|
-
# first, make a copy and add prompt caching to the last message
|
|
176
|
-
messages_cached = copy.deepcopy(self.messages)
|
|
177
|
-
# Mark last user message with cache control for prompt caching
|
|
178
|
-
last_msg = messages_cached[-1]
|
|
179
|
-
if last_msg.get("role") == "user":
|
|
180
|
-
last_content = last_msg["content"]
|
|
181
|
-
if isinstance(last_content, list):
|
|
182
|
-
for block in last_content:
|
|
183
|
-
if (
|
|
184
|
-
not block["type"] == "thinking"
|
|
185
|
-
and not block["type"] == "redacted_thinking"
|
|
186
|
-
):
|
|
187
|
-
cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
|
|
188
|
-
block["cache_control"] = cache_control
|
|
189
|
-
|
|
190
|
-
try:
|
|
191
|
-
response = await self.client.beta.messages.create(
|
|
192
|
-
model=self.model,
|
|
193
|
-
max_tokens=self.max_tokens,
|
|
194
|
-
messages=messages_cached,
|
|
195
|
-
tools=[COMPUTER_TOOL],
|
|
196
|
-
betas=["computer-use-2025-01-24"],
|
|
197
|
-
tool_choice={"type": "auto", "disable_parallel_tool_use": True},
|
|
198
|
-
)
|
|
199
|
-
except BadRequestError as e:
|
|
200
|
-
if e.message.startswith("prompt is too long"):
|
|
201
|
-
logger.warning(
|
|
202
|
-
f"Prompt is too long, removing the first 50 messages except for the first user message: {e.message}"
|
|
203
|
-
)
|
|
204
|
-
self.messages = [self.messages[0]] + self.messages[50:]
|
|
205
|
-
continue
|
|
206
|
-
else:
|
|
207
|
-
raise e
|
|
208
|
-
|
|
209
|
-
# break out of the while loop if we get a response
|
|
210
|
-
break
|
|
211
|
-
|
|
212
|
-
# Add Claude's response to the conversation history
|
|
213
|
-
response_content = response.content
|
|
214
|
-
self.messages.append(
|
|
215
|
-
cast(
|
|
216
|
-
BetaMessageParam,
|
|
217
|
-
{
|
|
218
|
-
"role": "assistant",
|
|
219
|
-
"content": response_content,
|
|
220
|
-
},
|
|
221
|
-
)
|
|
222
|
-
)
|
|
223
|
-
|
|
224
|
-
# Process tool use
|
|
225
|
-
actions: list[Any] = []
|
|
226
|
-
done = True # Assume we're done unless we find a tool use
|
|
227
|
-
|
|
228
|
-
for block in response_content:
|
|
229
|
-
# logger.info("Processing block: %s", block)
|
|
230
|
-
if block.type == "tool_use":
|
|
231
|
-
# logger.info("Processing tool use: %s", block)
|
|
232
|
-
assert block.name == "computer"
|
|
233
|
-
|
|
234
|
-
# Store the raw action
|
|
235
|
-
actions.append(block.input)
|
|
236
|
-
self.pending_computer_use_tool_id = block.id
|
|
237
|
-
|
|
238
|
-
# If we found a tool use, we're not done
|
|
239
|
-
done = False
|
|
240
|
-
break
|
|
241
|
-
|
|
242
|
-
# If no tool use action was found, check for a final text response
|
|
243
|
-
if len(actions) == 0 and done:
|
|
244
|
-
final_text_response = ""
|
|
245
|
-
for block in response_content:
|
|
246
|
-
if block.type == "text":
|
|
247
|
-
final_text_response += block.text
|
|
248
|
-
|
|
249
|
-
if final_text_response.strip():
|
|
250
|
-
# logger.info(
|
|
251
|
-
# f"No tool use found. Using final text as response: {final_text_response}"
|
|
252
|
-
# )
|
|
253
|
-
actions = [{"action": "response", "text": final_text_response.strip()}]
|
|
254
|
-
done = True
|
|
255
|
-
# else:
|
|
256
|
-
# logger.info("No tool use and no final text block found.")
|
|
257
|
-
# Keep done = True, actions remains empty
|
|
258
|
-
|
|
259
|
-
reasoning = ""
|
|
260
|
-
for block in response_content:
|
|
261
|
-
if block.type == "thinking":
|
|
262
|
-
reasoning += f"Thinking: {block.thinking}\n"
|
|
263
|
-
elif block.type == "text":
|
|
264
|
-
reasoning += block.text
|
|
265
|
-
|
|
266
|
-
# add reasoning to the actions
|
|
267
|
-
for action in actions:
|
|
268
|
-
action["reasoning"] = reasoning
|
|
269
|
-
action["logs"] = response.model_dump()
|
|
270
|
-
|
|
271
|
-
return actions, done
|
hud/agent/langchain.py
DELETED
|
@@ -1,215 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
from typing import Any, Generic, List, Optional, TypeVar, Union, cast
|
|
3
|
-
|
|
4
|
-
# Langchain imports
|
|
5
|
-
from langchain_core.language_models import BaseLanguageModel
|
|
6
|
-
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
|
|
7
|
-
from langchain_core.runnables import Runnable, RunnableSerializable
|
|
8
|
-
from pydantic import Field, BaseModel
|
|
9
|
-
|
|
10
|
-
# HUD imports
|
|
11
|
-
from hud.adapters import Adapter
|
|
12
|
-
from hud.agent.base import Agent
|
|
13
|
-
from hud.types import Gym
|
|
14
|
-
from hud.utils.common import Observation
|
|
15
|
-
from hud.adapters.common.types import (
|
|
16
|
-
ClickAction,
|
|
17
|
-
TypeAction,
|
|
18
|
-
ScrollAction,
|
|
19
|
-
MoveAction,
|
|
20
|
-
DragAction,
|
|
21
|
-
PressAction,
|
|
22
|
-
KeyDownAction,
|
|
23
|
-
KeyUpAction,
|
|
24
|
-
WaitAction,
|
|
25
|
-
ResponseAction,
|
|
26
|
-
CustomAction,
|
|
27
|
-
LogType,
|
|
28
|
-
# Exclude ScreenshotFetch, PositionFetch as they are internal
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
logger = logging.getLogger(__name__)
|
|
32
|
-
|
|
33
|
-
# Define a Pydantic Union type representing exactly ONE possible CLA action
|
|
34
|
-
# This is what we'll ask the Langchain model to output.
|
|
35
|
-
SingleCLAction = Union[
|
|
36
|
-
ClickAction,
|
|
37
|
-
TypeAction,
|
|
38
|
-
ScrollAction,
|
|
39
|
-
MoveAction,
|
|
40
|
-
DragAction,
|
|
41
|
-
PressAction,
|
|
42
|
-
KeyDownAction,
|
|
43
|
-
KeyUpAction,
|
|
44
|
-
WaitAction,
|
|
45
|
-
ResponseAction,
|
|
46
|
-
]
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
# Define a Pydantic model to wrap the single action, potentially making it
|
|
50
|
-
# easier for the LLM to consistently output the desired structure.
|
|
51
|
-
class StepAction(BaseModel):
|
|
52
|
-
"""Wrapper model requesting a single concrete CLA action from the Langchain model."""
|
|
53
|
-
|
|
54
|
-
action: SingleCLAction = Field(
|
|
55
|
-
..., description="The single CLA action to perform for this step."
|
|
56
|
-
)
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
# Generic Type for the Langchain Model/Runnable
|
|
60
|
-
# Allows flexibility in what the user provides (model, chain, etc.)
|
|
61
|
-
# Bound to BaseLanguageModel as .with_structured_output is expected
|
|
62
|
-
LangchainModelOrRunnable = TypeVar("LangchainModelOrRunnable", bound=BaseLanguageModel)
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
|
|
66
|
-
"""
|
|
67
|
-
An agent that uses an arbitrary Langchain model or runnable, leveraging
|
|
68
|
-
Langchain's structured output capabilities to produce a single CLA action per step.
|
|
69
|
-
"""
|
|
70
|
-
|
|
71
|
-
transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}
|
|
72
|
-
|
|
73
|
-
def __init__(
|
|
74
|
-
self,
|
|
75
|
-
langchain_model: LangchainModelOrRunnable,
|
|
76
|
-
adapter: Optional[Adapter] = None,
|
|
77
|
-
system_prompt: str | None = None,
|
|
78
|
-
name: str | None = None,
|
|
79
|
-
):
|
|
80
|
-
"""
|
|
81
|
-
Initialize the LangchainAgent.
|
|
82
|
-
|
|
83
|
-
Args:
|
|
84
|
-
langchain_model: The Langchain language model or runnable chain to use.
|
|
85
|
-
Must support asynchronous invocation (`ainvoke`) and
|
|
86
|
-
`.with_structured_output()`.
|
|
87
|
-
adapter: An optional HUD adapter. If provided, it will be used for
|
|
88
|
-
preprocessing observations (rescaling) and postprocessing
|
|
89
|
-
the single CLA action (coordinate rescaling).
|
|
90
|
-
system_prompt: An optional system prompt to guide the Langchain model.
|
|
91
|
-
If None, a default prompt encouraging single CLA output is used.
|
|
92
|
-
"""
|
|
93
|
-
super().__init__(
|
|
94
|
-
client=langchain_model, adapter=adapter, name=name
|
|
95
|
-
) # Store model as 'client'
|
|
96
|
-
self.langchain_model = langchain_model # Also store with specific name
|
|
97
|
-
|
|
98
|
-
self.system_prompt_str = system_prompt or self._get_default_system_prompt()
|
|
99
|
-
self.history: List[BaseMessage] = []
|
|
100
|
-
|
|
101
|
-
def _get_default_system_prompt(self) -> str:
|
|
102
|
-
# TODO: Refine this prompt based on testing.
|
|
103
|
-
# It needs to strongly encourage outputting *only* the StepAction structure.
|
|
104
|
-
return (
|
|
105
|
-
"You are an agent interacting with a computer environment (either a web browser or an OS desktop). "
|
|
106
|
-
"Your goal is to follow the user's instructions based on the provided text and screenshot observations."
|
|
107
|
-
"For each step, you must choose exactly ONE action to perform from the available CLA action types."
|
|
108
|
-
"Output your chosen action using the provided 'StepAction' tool/function."
|
|
109
|
-
"If you believe the task is complete based on the user's prompt and the observations, use the 'ResponseAction'."
|
|
110
|
-
)
|
|
111
|
-
|
|
112
|
-
async def fetch_response(
|
|
113
|
-
self, observation: Observation
|
|
114
|
-
) -> tuple[list[dict | SingleCLAction], bool]:
|
|
115
|
-
"""
|
|
116
|
-
Fetches a response from the configured Langchain model, expecting a single
|
|
117
|
-
structured CLA action.
|
|
118
|
-
|
|
119
|
-
Args:
|
|
120
|
-
observation: The preprocessed observation (screenshot potentially rescaled by adapter).
|
|
121
|
-
|
|
122
|
-
Returns:
|
|
123
|
-
A tuple containing:
|
|
124
|
-
- A list with a single dictionary representing the raw CLA action (before adapter postprocessing).
|
|
125
|
-
- A boolean indicating if the agent chose ResponseAction (task completion).
|
|
126
|
-
"""
|
|
127
|
-
# 1. Format observation into Langchain message(s)
|
|
128
|
-
human_content: List[Union[str, dict]] = []
|
|
129
|
-
if observation.text:
|
|
130
|
-
human_content.append(observation.text)
|
|
131
|
-
if observation.screenshot:
|
|
132
|
-
# Assuming the Langchain model/chain can handle base64 images
|
|
133
|
-
# This might need adjustment based on the specific model used.
|
|
134
|
-
human_content.append(
|
|
135
|
-
{
|
|
136
|
-
"type": "image_url",
|
|
137
|
-
"image_url": {"url": f"data:image/png;base64,{observation.screenshot}"},
|
|
138
|
-
}
|
|
139
|
-
)
|
|
140
|
-
|
|
141
|
-
if not human_content:
|
|
142
|
-
logger.warning("LangchainAgent received an observation with no text or screenshot.")
|
|
143
|
-
# Decide how to handle empty observation - perhaps return no action?
|
|
144
|
-
return [], False
|
|
145
|
-
|
|
146
|
-
current_human_message = HumanMessage(content=human_content)
|
|
147
|
-
|
|
148
|
-
# 2. Prepare message history for the model
|
|
149
|
-
messages_for_llm: List[BaseMessage] = [
|
|
150
|
-
SystemMessage(content=self.system_prompt_str),
|
|
151
|
-
*self.history,
|
|
152
|
-
current_human_message,
|
|
153
|
-
]
|
|
154
|
-
|
|
155
|
-
# 3. Configure structured output
|
|
156
|
-
# We ask for the StepAction wrapper, which contains the actual SingleCLAAction
|
|
157
|
-
# Explicitly use method="function_calling" to handle schemas with default values
|
|
158
|
-
structured_llm = self.langchain_model.with_structured_output(
|
|
159
|
-
schema=StepAction, method="function_calling"
|
|
160
|
-
)
|
|
161
|
-
|
|
162
|
-
# 4. Invoke Langchain model asynchronously
|
|
163
|
-
try:
|
|
164
|
-
ai_response_structured = await structured_llm.ainvoke(messages_for_llm)
|
|
165
|
-
except Exception as e:
|
|
166
|
-
logger.error(f"Langchain model invocation failed: {e}", exc_info=True)
|
|
167
|
-
# Decide how to handle LLM errors - maybe retry or return empty action?
|
|
168
|
-
return [], False
|
|
169
|
-
|
|
170
|
-
# 5. Process the structured response
|
|
171
|
-
is_done = False
|
|
172
|
-
ai_message_content_for_history = "" # For storing in history
|
|
173
|
-
|
|
174
|
-
if isinstance(ai_response_structured, StepAction):
|
|
175
|
-
# Successfully got the wrapper, extract the actual action
|
|
176
|
-
actual_action = ai_response_structured.action
|
|
177
|
-
ai_message_content_for_history = actual_action.model_dump()
|
|
178
|
-
if isinstance(actual_action, ResponseAction):
|
|
179
|
-
is_done = True
|
|
180
|
-
# logger.info(
|
|
181
|
-
# f"LangchainAgent determined task is done with response: {actual_action.text[:100]}..."
|
|
182
|
-
# )
|
|
183
|
-
# else:
|
|
184
|
-
# logger.info(f"LangchainAgent produced action: {type(actual_action).__name__}")
|
|
185
|
-
|
|
186
|
-
else:
|
|
187
|
-
logger.warning(
|
|
188
|
-
f"Langchain model did not return the expected StepAction structure. "
|
|
189
|
-
f"Received type: {type(ai_response_structured)}. Value: {ai_response_structured!r}"
|
|
190
|
-
)
|
|
191
|
-
# Attempt to add raw response to history for debugging
|
|
192
|
-
if isinstance(ai_response_structured, BaseMessage):
|
|
193
|
-
ai_message_content_for_history = ai_response_structured.content
|
|
194
|
-
elif isinstance(ai_response_structured, str):
|
|
195
|
-
ai_message_content_for_history = ai_response_structured
|
|
196
|
-
else:
|
|
197
|
-
ai_message_content_for_history = repr(ai_response_structured)
|
|
198
|
-
# Return no action as we didn't get the expected structure
|
|
199
|
-
return [], False
|
|
200
|
-
|
|
201
|
-
# 6. Update history
|
|
202
|
-
self.history.append(current_human_message)
|
|
203
|
-
# Add the AI response (containing the structured action dict) to history
|
|
204
|
-
# Convert dict to string representation for AIMessage content
|
|
205
|
-
self.history.append(AIMessage(content=repr(ai_message_content_for_history)))
|
|
206
|
-
# TODO: Consider history truncation/summarization if it grows too long
|
|
207
|
-
|
|
208
|
-
if actual_action:
|
|
209
|
-
actual_action = actual_action.model_dump()
|
|
210
|
-
# Return the single action dictionary within a list
|
|
211
|
-
actual_action["logs"] = ai_message_content_for_history
|
|
212
|
-
return [actual_action], is_done
|
|
213
|
-
else:
|
|
214
|
-
# Should ideally not happen if structure validation worked, but as a fallback
|
|
215
|
-
return [], is_done
|
hud/agent/misc/__init__.py
DELETED