hud-python 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (192)
  1. hud/__init__.py +22 -89
  2. hud/agents/__init__.py +17 -0
  3. hud/agents/art.py +101 -0
  4. hud/agents/base.py +599 -0
  5. hud/{mcp → agents}/claude.py +373 -321
  6. hud/{mcp → agents}/langchain.py +250 -250
  7. hud/agents/misc/__init__.py +7 -0
  8. hud/{agent → agents}/misc/response_agent.py +80 -80
  9. hud/{mcp → agents}/openai.py +352 -334
  10. hud/agents/openai_chat_generic.py +154 -0
  11. hud/{mcp → agents}/tests/__init__.py +1 -1
  12. hud/agents/tests/test_base.py +742 -0
  13. hud/agents/tests/test_claude.py +324 -0
  14. hud/{mcp → agents}/tests/test_client.py +363 -324
  15. hud/{mcp → agents}/tests/test_openai.py +237 -238
  16. hud/cli/__init__.py +617 -0
  17. hud/cli/__main__.py +8 -0
  18. hud/cli/analyze.py +371 -0
  19. hud/cli/analyze_metadata.py +230 -0
  20. hud/cli/build.py +427 -0
  21. hud/cli/clone.py +185 -0
  22. hud/cli/cursor.py +92 -0
  23. hud/cli/debug.py +392 -0
  24. hud/cli/docker_utils.py +83 -0
  25. hud/cli/init.py +281 -0
  26. hud/cli/interactive.py +353 -0
  27. hud/cli/mcp_server.py +756 -0
  28. hud/cli/pull.py +336 -0
  29. hud/cli/push.py +379 -0
  30. hud/cli/remote_runner.py +311 -0
  31. hud/cli/runner.py +160 -0
  32. hud/cli/tests/__init__.py +3 -0
  33. hud/cli/tests/test_analyze.py +284 -0
  34. hud/cli/tests/test_cli_init.py +265 -0
  35. hud/cli/tests/test_cli_main.py +27 -0
  36. hud/cli/tests/test_clone.py +142 -0
  37. hud/cli/tests/test_cursor.py +253 -0
  38. hud/cli/tests/test_debug.py +453 -0
  39. hud/cli/tests/test_mcp_server.py +139 -0
  40. hud/cli/tests/test_utils.py +388 -0
  41. hud/cli/utils.py +263 -0
  42. hud/clients/README.md +143 -0
  43. hud/clients/__init__.py +16 -0
  44. hud/clients/base.py +354 -0
  45. hud/clients/fastmcp.py +202 -0
  46. hud/clients/mcp_use.py +278 -0
  47. hud/clients/tests/__init__.py +1 -0
  48. hud/clients/tests/test_client_integration.py +111 -0
  49. hud/clients/tests/test_fastmcp.py +342 -0
  50. hud/clients/tests/test_protocol.py +188 -0
  51. hud/clients/utils/__init__.py +1 -0
  52. hud/clients/utils/retry_transport.py +160 -0
  53. hud/datasets.py +322 -192
  54. hud/misc/__init__.py +1 -0
  55. hud/{agent → misc}/claude_plays_pokemon.py +292 -283
  56. hud/otel/__init__.py +35 -0
  57. hud/otel/collector.py +142 -0
  58. hud/otel/config.py +164 -0
  59. hud/otel/context.py +536 -0
  60. hud/otel/exporters.py +366 -0
  61. hud/otel/instrumentation.py +97 -0
  62. hud/otel/processors.py +118 -0
  63. hud/otel/tests/__init__.py +1 -0
  64. hud/otel/tests/test_processors.py +197 -0
  65. hud/server/__init__.py +5 -5
  66. hud/server/context.py +114 -0
  67. hud/server/helper/__init__.py +5 -0
  68. hud/server/low_level.py +132 -0
  69. hud/server/server.py +166 -0
  70. hud/server/tests/__init__.py +3 -0
  71. hud/settings.py +73 -79
  72. hud/shared/__init__.py +5 -0
  73. hud/{exceptions.py → shared/exceptions.py} +180 -180
  74. hud/{server → shared}/requests.py +264 -264
  75. hud/shared/tests/test_exceptions.py +157 -0
  76. hud/{server → shared}/tests/test_requests.py +275 -275
  77. hud/telemetry/__init__.py +25 -30
  78. hud/telemetry/instrument.py +379 -0
  79. hud/telemetry/job.py +309 -141
  80. hud/telemetry/replay.py +74 -0
  81. hud/telemetry/trace.py +83 -0
  82. hud/tools/__init__.py +33 -34
  83. hud/tools/base.py +365 -65
  84. hud/tools/bash.py +161 -137
  85. hud/tools/computer/__init__.py +15 -13
  86. hud/tools/computer/anthropic.py +437 -414
  87. hud/tools/computer/hud.py +376 -328
  88. hud/tools/computer/openai.py +295 -286
  89. hud/tools/computer/settings.py +82 -0
  90. hud/tools/edit.py +314 -290
  91. hud/tools/executors/__init__.py +30 -30
  92. hud/tools/executors/base.py +539 -532
  93. hud/tools/executors/pyautogui.py +621 -619
  94. hud/tools/executors/tests/__init__.py +1 -1
  95. hud/tools/executors/tests/test_base_executor.py +338 -338
  96. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  97. hud/tools/executors/xdo.py +511 -503
  98. hud/tools/{playwright_tool.py → playwright.py} +412 -379
  99. hud/tools/tests/__init__.py +3 -3
  100. hud/tools/tests/test_base.py +282 -0
  101. hud/tools/tests/test_bash.py +158 -152
  102. hud/tools/tests/test_bash_extended.py +197 -0
  103. hud/tools/tests/test_computer.py +425 -52
  104. hud/tools/tests/test_computer_actions.py +34 -34
  105. hud/tools/tests/test_edit.py +259 -240
  106. hud/tools/tests/test_init.py +27 -27
  107. hud/tools/tests/test_playwright_tool.py +183 -183
  108. hud/tools/tests/test_tools.py +145 -157
  109. hud/tools/tests/test_utils.py +156 -156
  110. hud/tools/types.py +72 -0
  111. hud/tools/utils.py +50 -50
  112. hud/types.py +136 -89
  113. hud/utils/__init__.py +10 -16
  114. hud/utils/async_utils.py +65 -0
  115. hud/utils/design.py +168 -0
  116. hud/utils/mcp.py +55 -0
  117. hud/utils/progress.py +149 -149
  118. hud/utils/telemetry.py +66 -66
  119. hud/utils/tests/test_async_utils.py +173 -0
  120. hud/utils/tests/test_init.py +17 -21
  121. hud/utils/tests/test_progress.py +261 -225
  122. hud/utils/tests/test_telemetry.py +82 -37
  123. hud/utils/tests/test_version.py +8 -8
  124. hud/version.py +7 -7
  125. hud_python-0.4.0.dist-info/METADATA +474 -0
  126. hud_python-0.4.0.dist-info/RECORD +132 -0
  127. hud_python-0.4.0.dist-info/entry_points.txt +3 -0
  128. {hud_python-0.3.4.dist-info → hud_python-0.4.0.dist-info}/licenses/LICENSE +21 -21
  129. hud/adapters/__init__.py +0 -8
  130. hud/adapters/claude/__init__.py +0 -5
  131. hud/adapters/claude/adapter.py +0 -180
  132. hud/adapters/claude/tests/__init__.py +0 -1
  133. hud/adapters/claude/tests/test_adapter.py +0 -519
  134. hud/adapters/common/__init__.py +0 -6
  135. hud/adapters/common/adapter.py +0 -178
  136. hud/adapters/common/tests/test_adapter.py +0 -289
  137. hud/adapters/common/types.py +0 -446
  138. hud/adapters/operator/__init__.py +0 -5
  139. hud/adapters/operator/adapter.py +0 -108
  140. hud/adapters/operator/tests/__init__.py +0 -1
  141. hud/adapters/operator/tests/test_adapter.py +0 -370
  142. hud/agent/__init__.py +0 -19
  143. hud/agent/base.py +0 -126
  144. hud/agent/claude.py +0 -271
  145. hud/agent/langchain.py +0 -215
  146. hud/agent/misc/__init__.py +0 -3
  147. hud/agent/operator.py +0 -268
  148. hud/agent/tests/__init__.py +0 -1
  149. hud/agent/tests/test_base.py +0 -202
  150. hud/env/__init__.py +0 -11
  151. hud/env/client.py +0 -35
  152. hud/env/docker_client.py +0 -349
  153. hud/env/environment.py +0 -446
  154. hud/env/local_docker_client.py +0 -358
  155. hud/env/remote_client.py +0 -212
  156. hud/env/remote_docker_client.py +0 -292
  157. hud/gym.py +0 -130
  158. hud/job.py +0 -773
  159. hud/mcp/__init__.py +0 -17
  160. hud/mcp/base.py +0 -631
  161. hud/mcp/client.py +0 -312
  162. hud/mcp/tests/test_base.py +0 -512
  163. hud/mcp/tests/test_claude.py +0 -294
  164. hud/task.py +0 -149
  165. hud/taskset.py +0 -237
  166. hud/telemetry/_trace.py +0 -347
  167. hud/telemetry/context.py +0 -230
  168. hud/telemetry/exporter.py +0 -575
  169. hud/telemetry/instrumentation/__init__.py +0 -3
  170. hud/telemetry/instrumentation/mcp.py +0 -259
  171. hud/telemetry/instrumentation/registry.py +0 -59
  172. hud/telemetry/mcp_models.py +0 -270
  173. hud/telemetry/tests/__init__.py +0 -1
  174. hud/telemetry/tests/test_context.py +0 -210
  175. hud/telemetry/tests/test_trace.py +0 -312
  176. hud/tools/helper/README.md +0 -56
  177. hud/tools/helper/__init__.py +0 -9
  178. hud/tools/helper/mcp_server.py +0 -78
  179. hud/tools/helper/server_initialization.py +0 -115
  180. hud/tools/helper/utils.py +0 -58
  181. hud/trajectory.py +0 -94
  182. hud/utils/agent.py +0 -37
  183. hud/utils/common.py +0 -256
  184. hud/utils/config.py +0 -120
  185. hud/utils/deprecation.py +0 -115
  186. hud/utils/misc.py +0 -53
  187. hud/utils/tests/test_common.py +0 -277
  188. hud/utils/tests/test_config.py +0 -129
  189. hud_python-0.3.4.dist-info/METADATA +0 -284
  190. hud_python-0.3.4.dist-info/RECORD +0 -120
  191. hud/{adapters/common → shared}/tests/__init__.py +0 -0
  192. {hud_python-0.3.4.dist-info → hud_python-0.4.0.dist-info}/WHEEL +0 -0
hud/agent/claude.py DELETED
@@ -1,271 +0,0 @@
1
- import copy
2
- import logging
3
- from typing import Any, cast
4
-
5
- from anthropic import AsyncAnthropic, BadRequestError
6
- from anthropic.types.beta import (
7
- BetaMessageParam,
8
- BetaToolResultBlockParam,
9
- BetaToolComputerUse20250124Param,
10
- BetaTextBlockParam,
11
- BetaImageBlockParam,
12
- BetaCacheControlEphemeralParam,
13
- )
14
-
15
- from hud.adapters import Adapter
16
- from hud.agent.base import Agent
17
- from hud.adapters.claude import ClaudeAdapter
18
- from hud.types import Gym
19
- from hud.utils.common import Observation
20
- from hud.settings import settings
21
- from hud.adapters.common.types import LogType
22
-
23
- logger = logging.getLogger(__name__)
24
-
25
-
26
- def base64_to_content_block(base64: str) -> BetaImageBlockParam:
27
- return {
28
- "type": "image",
29
- "source": {"type": "base64", "media_type": "image/png", "data": base64},
30
- }
31
-
32
-
33
- def text_to_content_block(text: str) -> BetaTextBlockParam:
34
- return {"type": "text", "text": text}
35
-
36
-
37
- def tool_use_content_block(
38
- tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]
39
- ) -> BetaToolResultBlockParam:
40
- return {"type": "tool_result", "tool_use_id": tool_use_id, "content": content}
41
-
42
-
43
- # Claude's Computer Use Tool definition
44
- COMPUTER_TOOL: BetaToolComputerUse20250124Param = {
45
- "type": "computer_20250124",
46
- "name": "computer",
47
- "display_width_px": 1024,
48
- "display_height_px": 768,
49
- }
50
-
51
-
52
- class ClaudeAgent(Agent[AsyncAnthropic, Any]):
53
- """
54
- An agent implementation using Anthropic's Claude API with Computer Use.
55
-
56
- This agent interacts with HUD environments using Claude's Computer Use API
57
- through the ClaudeAdapter which converts actions to the format expected by HUD.
58
- """
59
-
60
- transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}
61
-
62
- def __init__(
63
- self,
64
- client: AsyncAnthropic | None = None,
65
- adapter: Adapter | None = None,
66
- model: str = "claude-3-7-sonnet-20250219",
67
- max_tokens: int = 4096,
68
- max_iterations: int = 10,
69
- name: str | None = None,
70
- ):
71
- """
72
- Initialize the ClaudeAgent.
73
-
74
- Args:
75
- client: The AsyncAnthropic client for API calls (optional, created automatically if not provided)
76
- adapter: The adapter to use for preprocessing and postprocessing
77
- model: The Claude model to use
78
- max_tokens: Maximum tokens for Claude's response
79
- max_iterations: Maximum number of iterations for the agent
80
- name: The name of the agent
81
- """
82
- # Initialize client if not provided
83
- if client is None:
84
- # Get API key from settings
85
- api_key = settings.anthropic_api_key
86
- if not api_key:
87
- raise ValueError(
88
- "Anthropic API key not found in settings or environment variables. Set ANTHROPIC_API_KEY."
89
- )
90
-
91
- # Create client
92
- client = AsyncAnthropic(api_key=api_key)
93
-
94
- adapter = adapter or ClaudeAdapter()
95
-
96
- if name is None:
97
- name = model
98
-
99
- super().__init__(client=client, adapter=adapter, name=name)
100
-
101
- self.model = model
102
- self.max_tokens = max_tokens
103
- self.max_iterations = max_iterations
104
-
105
- # Default dimensions - will be updated if adapter is provided
106
- self.width_px = 1024
107
- self.height_px = 768
108
-
109
- # Update dimensions if adapter is provided
110
- if self.adapter:
111
- self.width_px = self.adapter.agent_width
112
- self.height_px = self.adapter.agent_height
113
-
114
- # Message history
115
- self.messages: list[BetaMessageParam] = []
116
- self.pending_computer_use_tool_id = None
117
-
118
- async def fetch_response(self, observation: Observation) -> tuple[list[Any], bool]:
119
- """
120
- Fetch a response from Claude based on the observation.
121
-
122
- Args:
123
- observation: The preprocessed observation
124
-
125
- Returns:
126
- tuple[list[Any], bool, list[str | dict[str, Any]] | None]: A tuple containing the list of raw actions,
127
- boolean indicating if the agent believes the task is complete, and a list of strings or dictionaries of logs.
128
- """
129
- if not self.client:
130
- raise ValueError("Client is required")
131
-
132
- if not observation.text and not observation.screenshot:
133
- raise ValueError("Observation must contain either text or screenshot")
134
-
135
- # Prepare the user content for Claude
136
- user_content: list[BetaImageBlockParam | BetaTextBlockParam | BetaToolResultBlockParam] = []
137
-
138
- # Add text instruction if present
139
- if observation.text:
140
- # logger.info("Adding text to user content: %s", observation.text)
141
- user_content.append(text_to_content_block(str(observation.text)))
142
-
143
- # Add screenshot if present
144
- if observation.screenshot:
145
- # logger.info("Adding screenshot to user content")
146
- if not self.pending_computer_use_tool_id:
147
- # logger.info("Adding screenshot to user content, no tool id")
148
- user_content.append(base64_to_content_block(observation.screenshot))
149
- else:
150
- # logger.info(
151
- # "Adding screenshot to user content, tool id: %s",
152
- # self.pending_computer_use_tool_id,
153
- # )
154
- user_content.append(
155
- tool_use_content_block(
156
- self.pending_computer_use_tool_id,
157
- [base64_to_content_block(observation.screenshot)],
158
- )
159
- )
160
- self.pending_computer_use_tool_id = None
161
-
162
- # Add the user content to the messages
163
- self.messages.append(
164
- cast(
165
- BetaMessageParam,
166
- {
167
- "role": "user",
168
- "content": user_content,
169
- },
170
- )
171
- )
172
-
173
- # Call Claude API using async client, truncating 50 messages at a time if needed
174
- while True:
175
- # first, make a copy and add prompt caching to the last message
176
- messages_cached = copy.deepcopy(self.messages)
177
- # Mark last user message with cache control for prompt caching
178
- last_msg = messages_cached[-1]
179
- if last_msg.get("role") == "user":
180
- last_content = last_msg["content"]
181
- if isinstance(last_content, list):
182
- for block in last_content:
183
- if (
184
- not block["type"] == "thinking"
185
- and not block["type"] == "redacted_thinking"
186
- ):
187
- cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
188
- block["cache_control"] = cache_control
189
-
190
- try:
191
- response = await self.client.beta.messages.create(
192
- model=self.model,
193
- max_tokens=self.max_tokens,
194
- messages=messages_cached,
195
- tools=[COMPUTER_TOOL],
196
- betas=["computer-use-2025-01-24"],
197
- tool_choice={"type": "auto", "disable_parallel_tool_use": True},
198
- )
199
- except BadRequestError as e:
200
- if e.message.startswith("prompt is too long"):
201
- logger.warning(
202
- f"Prompt is too long, removing the first 50 messages except for the first user message: {e.message}"
203
- )
204
- self.messages = [self.messages[0]] + self.messages[50:]
205
- continue
206
- else:
207
- raise e
208
-
209
- # break out of the while loop if we get a response
210
- break
211
-
212
- # Add Claude's response to the conversation history
213
- response_content = response.content
214
- self.messages.append(
215
- cast(
216
- BetaMessageParam,
217
- {
218
- "role": "assistant",
219
- "content": response_content,
220
- },
221
- )
222
- )
223
-
224
- # Process tool use
225
- actions: list[Any] = []
226
- done = True # Assume we're done unless we find a tool use
227
-
228
- for block in response_content:
229
- # logger.info("Processing block: %s", block)
230
- if block.type == "tool_use":
231
- # logger.info("Processing tool use: %s", block)
232
- assert block.name == "computer"
233
-
234
- # Store the raw action
235
- actions.append(block.input)
236
- self.pending_computer_use_tool_id = block.id
237
-
238
- # If we found a tool use, we're not done
239
- done = False
240
- break
241
-
242
- # If no tool use action was found, check for a final text response
243
- if len(actions) == 0 and done:
244
- final_text_response = ""
245
- for block in response_content:
246
- if block.type == "text":
247
- final_text_response += block.text
248
-
249
- if final_text_response.strip():
250
- # logger.info(
251
- # f"No tool use found. Using final text as response: {final_text_response}"
252
- # )
253
- actions = [{"action": "response", "text": final_text_response.strip()}]
254
- done = True
255
- # else:
256
- # logger.info("No tool use and no final text block found.")
257
- # Keep done = True, actions remains empty
258
-
259
- reasoning = ""
260
- for block in response_content:
261
- if block.type == "thinking":
262
- reasoning += f"Thinking: {block.thinking}\n"
263
- elif block.type == "text":
264
- reasoning += block.text
265
-
266
- # add reasoning to the actions
267
- for action in actions:
268
- action["reasoning"] = reasoning
269
- action["logs"] = response.model_dump()
270
-
271
- return actions, done
hud/agent/langchain.py DELETED
@@ -1,215 +0,0 @@
1
- import logging
2
- from typing import Any, Generic, List, Optional, TypeVar, Union, cast
3
-
4
- # Langchain imports
5
- from langchain_core.language_models import BaseLanguageModel
6
- from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
7
- from langchain_core.runnables import Runnable, RunnableSerializable
8
- from pydantic import Field, BaseModel
9
-
10
- # HUD imports
11
- from hud.adapters import Adapter
12
- from hud.agent.base import Agent
13
- from hud.types import Gym
14
- from hud.utils.common import Observation
15
- from hud.adapters.common.types import (
16
- ClickAction,
17
- TypeAction,
18
- ScrollAction,
19
- MoveAction,
20
- DragAction,
21
- PressAction,
22
- KeyDownAction,
23
- KeyUpAction,
24
- WaitAction,
25
- ResponseAction,
26
- CustomAction,
27
- LogType,
28
- # Exclude ScreenshotFetch, PositionFetch as they are internal
29
- )
30
-
31
- logger = logging.getLogger(__name__)
32
-
33
- # Define a Pydantic Union type representing exactly ONE possible CLA action
34
- # This is what we'll ask the Langchain model to output.
35
- SingleCLAction = Union[
36
- ClickAction,
37
- TypeAction,
38
- ScrollAction,
39
- MoveAction,
40
- DragAction,
41
- PressAction,
42
- KeyDownAction,
43
- KeyUpAction,
44
- WaitAction,
45
- ResponseAction,
46
- ]
47
-
48
-
49
- # Define a Pydantic model to wrap the single action, potentially making it
50
- # easier for the LLM to consistently output the desired structure.
51
- class StepAction(BaseModel):
52
- """Wrapper model requesting a single concrete CLA action from the Langchain model."""
53
-
54
- action: SingleCLAction = Field(
55
- ..., description="The single CLA action to perform for this step."
56
- )
57
-
58
-
59
- # Generic Type for the Langchain Model/Runnable
60
- # Allows flexibility in what the user provides (model, chain, etc.)
61
- # Bound to BaseLanguageModel as .with_structured_output is expected
62
- LangchainModelOrRunnable = TypeVar("LangchainModelOrRunnable", bound=BaseLanguageModel)
63
-
64
-
65
- class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
66
- """
67
- An agent that uses an arbitrary Langchain model or runnable, leveraging
68
- Langchain's structured output capabilities to produce a single CLA action per step.
69
- """
70
-
71
- transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}
72
-
73
- def __init__(
74
- self,
75
- langchain_model: LangchainModelOrRunnable,
76
- adapter: Optional[Adapter] = None,
77
- system_prompt: str | None = None,
78
- name: str | None = None,
79
- ):
80
- """
81
- Initialize the LangchainAgent.
82
-
83
- Args:
84
- langchain_model: The Langchain language model or runnable chain to use.
85
- Must support asynchronous invocation (`ainvoke`) and
86
- `.with_structured_output()`.
87
- adapter: An optional HUD adapter. If provided, it will be used for
88
- preprocessing observations (rescaling) and postprocessing
89
- the single CLA action (coordinate rescaling).
90
- system_prompt: An optional system prompt to guide the Langchain model.
91
- If None, a default prompt encouraging single CLA output is used.
92
- """
93
- super().__init__(
94
- client=langchain_model, adapter=adapter, name=name
95
- ) # Store model as 'client'
96
- self.langchain_model = langchain_model # Also store with specific name
97
-
98
- self.system_prompt_str = system_prompt or self._get_default_system_prompt()
99
- self.history: List[BaseMessage] = []
100
-
101
- def _get_default_system_prompt(self) -> str:
102
- # TODO: Refine this prompt based on testing.
103
- # It needs to strongly encourage outputting *only* the StepAction structure.
104
- return (
105
- "You are an agent interacting with a computer environment (either a web browser or an OS desktop). "
106
- "Your goal is to follow the user's instructions based on the provided text and screenshot observations."
107
- "For each step, you must choose exactly ONE action to perform from the available CLA action types."
108
- "Output your chosen action using the provided 'StepAction' tool/function."
109
- "If you believe the task is complete based on the user's prompt and the observations, use the 'ResponseAction'."
110
- )
111
-
112
- async def fetch_response(
113
- self, observation: Observation
114
- ) -> tuple[list[dict | SingleCLAction], bool]:
115
- """
116
- Fetches a response from the configured Langchain model, expecting a single
117
- structured CLA action.
118
-
119
- Args:
120
- observation: The preprocessed observation (screenshot potentially rescaled by adapter).
121
-
122
- Returns:
123
- A tuple containing:
124
- - A list with a single dictionary representing the raw CLA action (before adapter postprocessing).
125
- - A boolean indicating if the agent chose ResponseAction (task completion).
126
- """
127
- # 1. Format observation into Langchain message(s)
128
- human_content: List[Union[str, dict]] = []
129
- if observation.text:
130
- human_content.append(observation.text)
131
- if observation.screenshot:
132
- # Assuming the Langchain model/chain can handle base64 images
133
- # This might need adjustment based on the specific model used.
134
- human_content.append(
135
- {
136
- "type": "image_url",
137
- "image_url": {"url": f"data:image/png;base64,{observation.screenshot}"},
138
- }
139
- )
140
-
141
- if not human_content:
142
- logger.warning("LangchainAgent received an observation with no text or screenshot.")
143
- # Decide how to handle empty observation - perhaps return no action?
144
- return [], False
145
-
146
- current_human_message = HumanMessage(content=human_content)
147
-
148
- # 2. Prepare message history for the model
149
- messages_for_llm: List[BaseMessage] = [
150
- SystemMessage(content=self.system_prompt_str),
151
- *self.history,
152
- current_human_message,
153
- ]
154
-
155
- # 3. Configure structured output
156
- # We ask for the StepAction wrapper, which contains the actual SingleCLAAction
157
- # Explicitly use method="function_calling" to handle schemas with default values
158
- structured_llm = self.langchain_model.with_structured_output(
159
- schema=StepAction, method="function_calling"
160
- )
161
-
162
- # 4. Invoke Langchain model asynchronously
163
- try:
164
- ai_response_structured = await structured_llm.ainvoke(messages_for_llm)
165
- except Exception as e:
166
- logger.error(f"Langchain model invocation failed: {e}", exc_info=True)
167
- # Decide how to handle LLM errors - maybe retry or return empty action?
168
- return [], False
169
-
170
- # 5. Process the structured response
171
- is_done = False
172
- ai_message_content_for_history = "" # For storing in history
173
-
174
- if isinstance(ai_response_structured, StepAction):
175
- # Successfully got the wrapper, extract the actual action
176
- actual_action = ai_response_structured.action
177
- ai_message_content_for_history = actual_action.model_dump()
178
- if isinstance(actual_action, ResponseAction):
179
- is_done = True
180
- # logger.info(
181
- # f"LangchainAgent determined task is done with response: {actual_action.text[:100]}..."
182
- # )
183
- # else:
184
- # logger.info(f"LangchainAgent produced action: {type(actual_action).__name__}")
185
-
186
- else:
187
- logger.warning(
188
- f"Langchain model did not return the expected StepAction structure. "
189
- f"Received type: {type(ai_response_structured)}. Value: {ai_response_structured!r}"
190
- )
191
- # Attempt to add raw response to history for debugging
192
- if isinstance(ai_response_structured, BaseMessage):
193
- ai_message_content_for_history = ai_response_structured.content
194
- elif isinstance(ai_response_structured, str):
195
- ai_message_content_for_history = ai_response_structured
196
- else:
197
- ai_message_content_for_history = repr(ai_response_structured)
198
- # Return no action as we didn't get the expected structure
199
- return [], False
200
-
201
- # 6. Update history
202
- self.history.append(current_human_message)
203
- # Add the AI response (containing the structured action dict) to history
204
- # Convert dict to string representation for AIMessage content
205
- self.history.append(AIMessage(content=repr(ai_message_content_for_history)))
206
- # TODO: Consider history truncation/summarization if it grows too long
207
-
208
- if actual_action:
209
- actual_action = actual_action.model_dump()
210
- # Return the single action dictionary within a list
211
- actual_action["logs"] = ai_message_content_for_history
212
- return [actual_action], is_done
213
- else:
214
- # Should ideally not happen if structure validation worked, but as a fallback
215
- return [], is_done
hud/agent/misc/__init__.py DELETED
@@ -1,3 +0,0 @@
1
- from .response_agent import ResponseAgent
2
-
3
- __all__ = ["ResponseAgent"]