hud-python 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +3 -2
- hud/adapters/__init__.py +2 -1
- hud/adapters/claude/adapter.py +15 -2
- hud/adapters/common/types.py +7 -3
- hud/adapters/operator/adapter.py +10 -6
- hud/agent/__init__.py +2 -1
- hud/agent/claude.py +22 -2
- hud/agent/langchain.py +198 -0
- hud/agent/operator.py +35 -17
- hud/env/docker_client.py +1 -1
- hud/env/environment.py +182 -9
- hud/env/local_docker_client.py +3 -1
- hud/env/remote_client.py +4 -0
- hud/gym.py +3 -3
- hud/job.py +420 -12
- hud/task.py +41 -30
- hud/taskset.py +8 -0
- hud/types.py +5 -3
- hud/utils/common.py +31 -1
- hud/utils/config.py +2 -93
- hud/utils/progress.py +136 -0
- {hud_python-0.2.0.dist-info → hud_python-0.2.2.dist-info}/METADATA +52 -39
- hud_python-0.2.2.dist-info/RECORD +46 -0
- hud_python-0.2.0.dist-info/RECORD +0 -44
- {hud_python-0.2.0.dist-info → hud_python-0.2.2.dist-info}/WHEEL +0 -0
- {hud_python-0.2.0.dist-info → hud_python-0.2.2.dist-info}/licenses/LICENSE +0 -0
hud/__init__.py
CHANGED
|
@@ -5,10 +5,10 @@ HUD Gym SDK - A Python SDK for interacting with HUD environments.
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
7
|
from . import agent, env, gym, settings, task, taskset, types, utils
|
|
8
|
-
from .job import create_job, job, load_job
|
|
8
|
+
from .job import create_job, job, load_job, run_job
|
|
9
9
|
from .taskset import load_taskset
|
|
10
10
|
|
|
11
|
-
__version__ = "0.2.0"
|
|
11
|
+
__version__ = "0.2.2"
|
|
12
12
|
|
|
13
13
|
__all__ = [
|
|
14
14
|
"agent",
|
|
@@ -18,6 +18,7 @@ __all__ = [
|
|
|
18
18
|
"job",
|
|
19
19
|
"load_job",
|
|
20
20
|
"load_taskset",
|
|
21
|
+
"run_job",
|
|
21
22
|
"settings",
|
|
22
23
|
"task",
|
|
23
24
|
"taskset",
|
hud/adapters/__init__.py
CHANGED
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
from .claude import ClaudeAdapter
|
|
4
4
|
from .common import CLA, Adapter
|
|
5
|
+
from .common.types import ResponseAction
|
|
5
6
|
from .operator import OperatorAdapter
|
|
6
7
|
|
|
7
|
-
__all__ = ["CLA", "Adapter", "ClaudeAdapter", "OperatorAdapter"]
|
|
8
|
+
__all__ = ["CLA", "Adapter", "ClaudeAdapter", "OperatorAdapter", "ResponseAction"]
|
hud/adapters/claude/adapter.py
CHANGED
|
@@ -13,6 +13,7 @@ from hud.adapters.common.types import (
|
|
|
13
13
|
Point,
|
|
14
14
|
PositionFetch,
|
|
15
15
|
PressAction,
|
|
16
|
+
ResponseAction,
|
|
16
17
|
ScreenshotFetch,
|
|
17
18
|
ScrollAction,
|
|
18
19
|
TypeAction,
|
|
@@ -21,7 +22,14 @@ from hud.adapters.common.types import (
|
|
|
21
22
|
|
|
22
23
|
|
|
23
24
|
class ClaudeAdapter(Adapter):
|
|
24
|
-
KEY_MAP: ClassVar[dict[str, CLAKey]] = {
|
|
25
|
+
KEY_MAP: ClassVar[dict[str, CLAKey]] = {
|
|
26
|
+
"return": "enter",
|
|
27
|
+
"super": "win",
|
|
28
|
+
"super_l": "win",
|
|
29
|
+
"super_r": "win",
|
|
30
|
+
"right shift": "shift",
|
|
31
|
+
"left shift": "shift",
|
|
32
|
+
}
|
|
25
33
|
|
|
26
34
|
def __init__(self) -> None:
|
|
27
35
|
super().__init__()
|
|
@@ -30,7 +38,8 @@ class ClaudeAdapter(Adapter):
|
|
|
30
38
|
|
|
31
39
|
def _map_key(self, key: str) -> CLAKey:
|
|
32
40
|
"""Map a key to its standardized form."""
|
|
33
|
-
return self.KEY_MAP.get(key, key.lower()) # type: ignore
|
|
41
|
+
return self.KEY_MAP.get(key.lower(), key.lower()) # type: ignore
|
|
42
|
+
|
|
34
43
|
def convert(self, data: Any) -> CLA:
|
|
35
44
|
try:
|
|
36
45
|
action_type = data.get("action")
|
|
@@ -151,6 +160,10 @@ class ClaudeAdapter(Adapter):
|
|
|
151
160
|
elif action_type == "wait":
|
|
152
161
|
assert "duration" in data
|
|
153
162
|
return WaitAction(time=data["duration"])
|
|
163
|
+
|
|
164
|
+
elif action_type == "response":
|
|
165
|
+
return ResponseAction(text=data.get("text", ""))
|
|
166
|
+
|
|
154
167
|
else:
|
|
155
168
|
raise ValueError(f"Unsupported action type: {action_type}")
|
|
156
169
|
except AssertionError:
|
hud/adapters/common/types.py
CHANGED
|
@@ -20,7 +20,6 @@ class Point(BaseModel):
|
|
|
20
20
|
class ClickAction(CLAAction):
|
|
21
21
|
type: Literal["click"] = "click"
|
|
22
22
|
point: Point | None = None
|
|
23
|
-
selector: str | None = None
|
|
24
23
|
button: Literal["left", "right", "wheel", "back", "forward"] = "left"
|
|
25
24
|
pattern: list[int] | None = None # [delay_1, delay_2, ...]
|
|
26
25
|
hold_keys: list[CLAKey] | None = None
|
|
@@ -48,7 +47,6 @@ class KeyUpAction(CLAAction):
|
|
|
48
47
|
class TypeAction(CLAAction):
|
|
49
48
|
type: Literal["type"] = "type"
|
|
50
49
|
text: str
|
|
51
|
-
selector: str | None = None
|
|
52
50
|
enter_after: bool | None = False
|
|
53
51
|
|
|
54
52
|
|
|
@@ -64,7 +62,6 @@ class ScrollAction(CLAAction):
|
|
|
64
62
|
class MoveAction(CLAAction):
|
|
65
63
|
type: Literal["move"] = "move"
|
|
66
64
|
point: Point | None = None
|
|
67
|
-
selector: str | None = None
|
|
68
65
|
offset: Point | None = None
|
|
69
66
|
|
|
70
67
|
|
|
@@ -82,6 +79,12 @@ class DragAction(CLAAction):
|
|
|
82
79
|
hold_keys: list[CLAKey] | None = None
|
|
83
80
|
|
|
84
81
|
|
|
82
|
+
# RESPONSE ACTION from agent
|
|
83
|
+
class ResponseAction(CLAAction):
|
|
84
|
+
type: Literal["response"] = "response"
|
|
85
|
+
text: str # The final textual response from the agent
|
|
86
|
+
|
|
87
|
+
|
|
85
88
|
# SCREENSHOT ACTION
|
|
86
89
|
class ScreenshotFetch(CLAAction):
|
|
87
90
|
type: Literal["screenshot"] = "screenshot"
|
|
@@ -103,6 +106,7 @@ CLA = Annotated[
|
|
|
103
106
|
| KeyDownAction
|
|
104
107
|
| KeyUpAction
|
|
105
108
|
| TypeAction
|
|
109
|
+
| ResponseAction
|
|
106
110
|
| ScrollAction
|
|
107
111
|
| MoveAction
|
|
108
112
|
| WaitAction
|
hud/adapters/operator/adapter.py
CHANGED
|
@@ -10,6 +10,7 @@ from hud.adapters.common.types import (
|
|
|
10
10
|
MoveAction,
|
|
11
11
|
Point,
|
|
12
12
|
PressAction,
|
|
13
|
+
ResponseAction,
|
|
13
14
|
ScreenshotFetch,
|
|
14
15
|
ScrollAction,
|
|
15
16
|
TypeAction,
|
|
@@ -19,11 +20,11 @@ from hud.adapters.common.types import (
|
|
|
19
20
|
|
|
20
21
|
class OperatorAdapter(Adapter):
|
|
21
22
|
KEY_MAP: ClassVar[dict[str, CLAKey]] = {
|
|
22
|
-
"
|
|
23
|
-
"
|
|
24
|
-
"
|
|
25
|
-
"
|
|
26
|
-
"
|
|
23
|
+
"return": "enter",
|
|
24
|
+
"arrowup": "up",
|
|
25
|
+
"arrowdown": "down",
|
|
26
|
+
"arrowleft": "left",
|
|
27
|
+
"arrowright": "right",
|
|
27
28
|
}
|
|
28
29
|
|
|
29
30
|
def __init__(self) -> None:
|
|
@@ -34,7 +35,7 @@ class OperatorAdapter(Adapter):
|
|
|
34
35
|
|
|
35
36
|
def _map_key(self, key: str) -> CLAKey:
|
|
36
37
|
"""Map a key to its standardized form."""
|
|
37
|
-
return self.KEY_MAP.get(key, key.lower()) # type: ignore
|
|
38
|
+
return self.KEY_MAP.get(key.lower(), key.lower()) # type: ignore
|
|
38
39
|
|
|
39
40
|
def convert(self, data: Any) -> CLA:
|
|
40
41
|
"""Convert a Computer Use action to a HUD action"""
|
|
@@ -86,6 +87,9 @@ class OperatorAdapter(Adapter):
|
|
|
86
87
|
|
|
87
88
|
elif action_type == "screenshot":
|
|
88
89
|
return ScreenshotFetch()
|
|
90
|
+
|
|
91
|
+
elif action_type == "response":
|
|
92
|
+
return ResponseAction(text=data.get("text", ""))
|
|
89
93
|
else:
|
|
90
94
|
raise ValueError(f"Unsupported action type: {action_type}")
|
|
91
95
|
|
hud/agent/__init__.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from .base import Agent
|
|
2
2
|
from .claude import ClaudeAgent
|
|
3
3
|
from .operator import OperatorAgent
|
|
4
|
+
from .langchain import LangchainAgent
|
|
4
5
|
|
|
5
6
|
from hud.adapters import OperatorAdapter, ClaudeAdapter
|
|
6
7
|
|
|
7
|
-
__all__ = ["Agent", "ClaudeAgent", "OperatorAgent", "OperatorAdapter", "ClaudeAdapter"]
|
|
8
|
+
__all__ = ["Agent", "ClaudeAgent", "OperatorAgent", "OperatorAdapter", "ClaudeAdapter", "LangchainAgent"]
|
hud/agent/claude.py
CHANGED
|
@@ -11,7 +11,7 @@ from anthropic.types.beta import (
|
|
|
11
11
|
BetaImageBlockParam,
|
|
12
12
|
)
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
from hud.adapters import Adapter
|
|
15
15
|
from hud.agent.base import Agent
|
|
16
16
|
from hud.adapters.claude import ClaudeAdapter
|
|
17
17
|
from hud.env.environment import Observation
|
|
@@ -61,7 +61,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
|
61
61
|
def __init__(
|
|
62
62
|
self,
|
|
63
63
|
client: AsyncAnthropic | None = None,
|
|
64
|
-
adapter:
|
|
64
|
+
adapter: Adapter | None = None,
|
|
65
65
|
model: str = "claude-3-7-sonnet-20250219",
|
|
66
66
|
max_tokens: int = 4096,
|
|
67
67
|
max_iterations: int = 10,
|
|
@@ -85,6 +85,8 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
|
85
85
|
|
|
86
86
|
# Create client
|
|
87
87
|
client = AsyncAnthropic(api_key=api_key)
|
|
88
|
+
|
|
89
|
+
adapter = adapter or ClaudeAdapter()
|
|
88
90
|
|
|
89
91
|
super().__init__(client=client, adapter=adapter)
|
|
90
92
|
|
|
@@ -184,4 +186,22 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
|
184
186
|
done = False
|
|
185
187
|
break
|
|
186
188
|
|
|
189
|
+
# If no tool use action was found, check for a final text response
|
|
190
|
+
if not actions and done:
|
|
191
|
+
final_text_response = ""
|
|
192
|
+
for block in response_content:
|
|
193
|
+
if block.type == "text":
|
|
194
|
+
final_text_response += block.text
|
|
195
|
+
|
|
196
|
+
if final_text_response.strip():
|
|
197
|
+
logger.info(f"No tool use found. Using final text as response: {final_text_response}")
|
|
198
|
+
actions = [{
|
|
199
|
+
"action": "response",
|
|
200
|
+
"text": final_text_response.strip()
|
|
201
|
+
}]
|
|
202
|
+
# Keep done = True
|
|
203
|
+
else:
|
|
204
|
+
logger.info("No tool use and no final text block found.")
|
|
205
|
+
# Keep done = True, actions remains empty
|
|
206
|
+
|
|
187
207
|
return actions, done
|
hud/agent/langchain.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any, Generic, List, Optional, TypeVar, Union, cast
|
|
3
|
+
|
|
4
|
+
# Langchain imports
|
|
5
|
+
from langchain_core.language_models import BaseLanguageModel
|
|
6
|
+
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
|
|
7
|
+
from langchain_core.runnables import Runnable, RunnableSerializable
|
|
8
|
+
from pydantic import Field, BaseModel
|
|
9
|
+
|
|
10
|
+
# HUD imports
|
|
11
|
+
from hud.adapters import Adapter
|
|
12
|
+
from hud.agent.base import Agent
|
|
13
|
+
from hud.env.environment import Observation
|
|
14
|
+
from hud.adapters.common.types import (
|
|
15
|
+
CLA,
|
|
16
|
+
ClickAction,
|
|
17
|
+
TypeAction,
|
|
18
|
+
ScrollAction,
|
|
19
|
+
MoveAction,
|
|
20
|
+
DragAction,
|
|
21
|
+
PressAction,
|
|
22
|
+
KeyDownAction,
|
|
23
|
+
KeyUpAction,
|
|
24
|
+
WaitAction,
|
|
25
|
+
ResponseAction,
|
|
26
|
+
CustomAction,
|
|
27
|
+
# Exclude ScreenshotFetch, PositionFetch as they are internal
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
# Define a Pydantic Union type representing exactly ONE possible CLA action
|
|
33
|
+
# This is what we'll ask the Langchain model to output.
|
|
34
|
+
SingleCLAction = Union[
|
|
35
|
+
ClickAction,
|
|
36
|
+
TypeAction,
|
|
37
|
+
ScrollAction,
|
|
38
|
+
MoveAction,
|
|
39
|
+
DragAction,
|
|
40
|
+
PressAction,
|
|
41
|
+
KeyDownAction,
|
|
42
|
+
KeyUpAction,
|
|
43
|
+
WaitAction,
|
|
44
|
+
ResponseAction,
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
# Define a Pydantic model to wrap the single action, potentially making it
|
|
48
|
+
# easier for the LLM to consistently output the desired structure.
|
|
49
|
+
class StepAction(BaseModel):
|
|
50
|
+
"""Wrapper model requesting a single concrete CLA action from the Langchain model."""
|
|
51
|
+
action: SingleCLAction = Field(..., description="The single CLA action to perform for this step.")
|
|
52
|
+
|
|
53
|
+
# Generic Type for the Langchain Model/Runnable
|
|
54
|
+
# Allows flexibility in what the user provides (model, chain, etc.)
|
|
55
|
+
# Bound to BaseLanguageModel as .with_structured_output is expected
|
|
56
|
+
LangchainModelOrRunnable = TypeVar("LangchainModelOrRunnable", bound=BaseLanguageModel)
|
|
57
|
+
|
|
58
|
+
class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
|
|
59
|
+
"""
|
|
60
|
+
An agent that uses an arbitrary Langchain model or runnable, leveraging
|
|
61
|
+
Langchain's structured output capabilities to produce a single CLA action per step.
|
|
62
|
+
"""
|
|
63
|
+
|
|
64
|
+
def __init__(
|
|
65
|
+
self,
|
|
66
|
+
langchain_model: LangchainModelOrRunnable,
|
|
67
|
+
adapter: Optional[Adapter] = None,
|
|
68
|
+
system_prompt: str | None = None,
|
|
69
|
+
):
|
|
70
|
+
"""
|
|
71
|
+
Initialize the LangchainAgent.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
langchain_model: The Langchain language model or runnable chain to use.
|
|
75
|
+
Must support asynchronous invocation (`ainvoke`) and
|
|
76
|
+
`.with_structured_output()`.
|
|
77
|
+
adapter: An optional HUD adapter. If provided, it will be used for
|
|
78
|
+
preprocessing observations (rescaling) and postprocessing
|
|
79
|
+
the single CLA action (coordinate rescaling).
|
|
80
|
+
system_prompt: An optional system prompt to guide the Langchain model.
|
|
81
|
+
If None, a default prompt encouraging single CLA output is used.
|
|
82
|
+
"""
|
|
83
|
+
super().__init__(client=langchain_model, adapter=adapter) # Store model as 'client'
|
|
84
|
+
self.langchain_model = langchain_model # Also store with specific name
|
|
85
|
+
|
|
86
|
+
self.system_prompt_str = system_prompt or self._get_default_system_prompt()
|
|
87
|
+
self.history: List[BaseMessage] = []
|
|
88
|
+
|
|
89
|
+
def _get_default_system_prompt(self) -> str:
|
|
90
|
+
# TODO: Refine this prompt based on testing.
|
|
91
|
+
# It needs to strongly encourage outputting *only* the StepAction structure.
|
|
92
|
+
return (
|
|
93
|
+
"You are an agent interacting with a computer environment (either a web browser or an OS desktop). "
|
|
94
|
+
"Your goal is to follow the user's instructions based on the provided text and screenshot observations."
|
|
95
|
+
"For each step, you must choose exactly ONE action to perform from the available CLA action types."
|
|
96
|
+
"Output your chosen action using the provided 'StepAction' tool/function."
|
|
97
|
+
"If you believe the task is complete based on the user's prompt and the observations, use the 'ResponseAction'."
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
async def fetch_response(self, observation: Observation) -> tuple[CLA | None, bool]:
|
|
101
|
+
"""
|
|
102
|
+
Fetches a response from the configured Langchain model, expecting a single
|
|
103
|
+
structured CLA action.
|
|
104
|
+
|
|
105
|
+
Args:
|
|
106
|
+
observation: The preprocessed observation (screenshot potentially rescaled by adapter).
|
|
107
|
+
|
|
108
|
+
Returns:
|
|
109
|
+
A tuple containing:
|
|
110
|
+
- A list with a single dictionary representing the raw CLA action (before adapter postprocessing).
|
|
111
|
+
- A boolean indicating if the agent chose ResponseAction (task completion).
|
|
112
|
+
"""
|
|
113
|
+
# 1. Format observation into Langchain message(s)
|
|
114
|
+
human_content: List[Union[str, dict]] = []
|
|
115
|
+
if observation.text:
|
|
116
|
+
human_content.append(observation.text)
|
|
117
|
+
if observation.screenshot:
|
|
118
|
+
# Assuming the Langchain model/chain can handle base64 images
|
|
119
|
+
# This might need adjustment based on the specific model used.
|
|
120
|
+
human_content.append({
|
|
121
|
+
"type": "image_url",
|
|
122
|
+
"image_url": {
|
|
123
|
+
"url": f"data:image/png;base64,{observation.screenshot}"
|
|
124
|
+
}
|
|
125
|
+
})
|
|
126
|
+
|
|
127
|
+
if not human_content:
|
|
128
|
+
logger.warning("LangchainAgent received an observation with no text or screenshot.")
|
|
129
|
+
# Decide how to handle empty observation - perhaps return no action?
|
|
130
|
+
return [], False # Or raise an error?
|
|
131
|
+
|
|
132
|
+
current_human_message = HumanMessage(content=human_content)
|
|
133
|
+
|
|
134
|
+
# 2. Prepare message history for the model
|
|
135
|
+
messages_for_llm: List[BaseMessage] = [
|
|
136
|
+
SystemMessage(content=self.system_prompt_str),
|
|
137
|
+
*self.history,
|
|
138
|
+
current_human_message,
|
|
139
|
+
]
|
|
140
|
+
|
|
141
|
+
# 3. Configure structured output
|
|
142
|
+
# We ask for the StepAction wrapper, which contains the actual SingleCLAAction
|
|
143
|
+
# Explicitly use method="function_calling" to handle schemas with default values
|
|
144
|
+
structured_llm = self.langchain_model.with_structured_output(
|
|
145
|
+
schema=StepAction,
|
|
146
|
+
method="function_calling"
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
# 4. Invoke Langchain model asynchronously
|
|
150
|
+
try:
|
|
151
|
+
ai_response_structured = await structured_llm.ainvoke(messages_for_llm)
|
|
152
|
+
except Exception as e:
|
|
153
|
+
logger.error(f"Langchain model invocation failed: {e}", exc_info=True)
|
|
154
|
+
# Decide how to handle LLM errors - maybe retry or return empty action?
|
|
155
|
+
return [], False
|
|
156
|
+
|
|
157
|
+
# 5. Process the structured response
|
|
158
|
+
is_done = False
|
|
159
|
+
ai_message_content_for_history = "" # For storing in history
|
|
160
|
+
|
|
161
|
+
if isinstance(ai_response_structured, StepAction):
|
|
162
|
+
# Successfully got the wrapper, extract the actual action
|
|
163
|
+
actual_action = ai_response_structured.action
|
|
164
|
+
ai_message_content_for_history = actual_action.model_dump()
|
|
165
|
+
if isinstance(actual_action, ResponseAction):
|
|
166
|
+
is_done = True
|
|
167
|
+
logger.info(f"LangchainAgent determined task is done with response: {actual_action.text[:100]}...")
|
|
168
|
+
else:
|
|
169
|
+
logger.info(f"LangchainAgent produced action: {type(actual_action).__name__}")
|
|
170
|
+
|
|
171
|
+
else:
|
|
172
|
+
logger.warning(
|
|
173
|
+
f"Langchain model did not return the expected StepAction structure. "
|
|
174
|
+
f"Received type: {type(ai_response_structured)}. Value: {ai_response_structured!r}"
|
|
175
|
+
)
|
|
176
|
+
# Attempt to add raw response to history for debugging
|
|
177
|
+
if isinstance(ai_response_structured, BaseMessage):
|
|
178
|
+
ai_message_content_for_history = ai_response_structured.content
|
|
179
|
+
elif isinstance(ai_response_structured, str):
|
|
180
|
+
ai_message_content_for_history = ai_response_structured
|
|
181
|
+
else:
|
|
182
|
+
ai_message_content_for_history = repr(ai_response_structured)
|
|
183
|
+
# Return no action as we didn't get the expected structure
|
|
184
|
+
return [], False
|
|
185
|
+
|
|
186
|
+
# 6. Update history
|
|
187
|
+
self.history.append(current_human_message)
|
|
188
|
+
# Add the AI response (containing the structured action dict) to history
|
|
189
|
+
# Convert dict to string representation for AIMessage content
|
|
190
|
+
self.history.append(AIMessage(content=repr(ai_message_content_for_history)))
|
|
191
|
+
# TODO: Consider history truncation/summarization if it grows too long
|
|
192
|
+
|
|
193
|
+
if actual_action:
|
|
194
|
+
# Return the single action dictionary within a list
|
|
195
|
+
return [actual_action], is_done
|
|
196
|
+
else:
|
|
197
|
+
# Should ideally not happen if structure validation worked, but as a fallback
|
|
198
|
+
return [], is_done
|
hud/agent/operator.py
CHANGED
|
@@ -9,9 +9,11 @@ from openai.types.responses import (
|
|
|
9
9
|
ResponseInputParam,
|
|
10
10
|
ResponseInputItemParam,
|
|
11
11
|
ResponseOutputMessage,
|
|
12
|
-
ResponseComputerToolCall
|
|
12
|
+
ResponseComputerToolCall,
|
|
13
|
+
ResponseOutputText
|
|
13
14
|
)
|
|
14
15
|
|
|
16
|
+
from hud.adapters import Adapter
|
|
15
17
|
from hud.agent.base import Agent
|
|
16
18
|
from hud.adapters.operator import OperatorAdapter
|
|
17
19
|
from hud.env.environment import Observation
|
|
@@ -32,7 +34,7 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
|
|
|
32
34
|
client: OpenAI | None = None,
|
|
33
35
|
model: str = "computer-use-preview",
|
|
34
36
|
environment: Literal["windows", "mac", "linux", "browser"] = "windows",
|
|
35
|
-
adapter:
|
|
37
|
+
adapter: Adapter | None = None,
|
|
36
38
|
max_iterations: int = 8
|
|
37
39
|
):
|
|
38
40
|
"""
|
|
@@ -54,6 +56,8 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
|
|
|
54
56
|
|
|
55
57
|
# Create synchronous client
|
|
56
58
|
client = OpenAI(api_key=api_key)
|
|
59
|
+
|
|
60
|
+
adapter = adapter or OperatorAdapter()
|
|
57
61
|
|
|
58
62
|
super().__init__(client=client, adapter=adapter)
|
|
59
63
|
|
|
@@ -74,7 +78,7 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
|
|
|
74
78
|
self.last_response_id = None
|
|
75
79
|
self.pending_call_id = None
|
|
76
80
|
self.initial_prompt = None
|
|
77
|
-
|
|
81
|
+
|
|
78
82
|
async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
|
|
79
83
|
"""
|
|
80
84
|
Fetch a response from the model based on the observation.
|
|
@@ -158,33 +162,47 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
|
|
|
158
162
|
# Store the response ID for the next call
|
|
159
163
|
self.last_response_id = response.id
|
|
160
164
|
|
|
161
|
-
# Process the response to extract
|
|
165
|
+
# Process the response to extract actions or final text
|
|
162
166
|
actions = []
|
|
163
|
-
done = True # Assume
|
|
164
|
-
|
|
165
|
-
|
|
167
|
+
done = True # Assume done unless a computer call is found
|
|
168
|
+
final_text_response = ""
|
|
169
|
+
|
|
170
|
+
# Check for computer calls first
|
|
166
171
|
computer_calls = [
|
|
167
172
|
item for item in response.output
|
|
168
173
|
if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
|
|
169
174
|
]
|
|
170
175
|
|
|
171
176
|
if computer_calls:
|
|
172
|
-
#
|
|
177
|
+
# If computer calls exist, process them and set done=False
|
|
173
178
|
done = False
|
|
174
|
-
|
|
175
|
-
# Process all computer calls
|
|
176
179
|
for computer_call in computer_calls:
|
|
177
180
|
self.pending_call_id = computer_call.call_id
|
|
178
181
|
action = computer_call.action
|
|
179
|
-
actions.append(action.model_dump())
|
|
180
|
-
|
|
181
|
-
# Log the action
|
|
182
|
+
actions.append(action.model_dump()) # Convert Pydantic model to dict
|
|
182
183
|
logger.info(f"Computer call action: {action}")
|
|
183
184
|
else:
|
|
184
|
-
#
|
|
185
|
-
logger.info("No computer call found
|
|
185
|
+
# No computer calls, check for a final text message
|
|
186
|
+
logger.info("No computer call found. Checking for final message.")
|
|
187
|
+
logger.info(response.output)
|
|
186
188
|
for item in response.output:
|
|
187
189
|
if isinstance(item, ResponseOutputMessage) and item.type == "message":
|
|
188
|
-
|
|
189
|
-
|
|
190
|
+
# Extract text from content blocks within the message
|
|
191
|
+
full_text = "".join([c.text for c in item.content if isinstance(c, ResponseOutputText)])
|
|
192
|
+
if full_text:
|
|
193
|
+
final_text_response = full_text
|
|
194
|
+
logger.info(f"Final text message: {final_text_response}")
|
|
195
|
+
break # Stop after finding the first text message
|
|
196
|
+
|
|
197
|
+
# If we found final text, package it as a 'response' action
|
|
198
|
+
if final_text_response:
|
|
199
|
+
actions = [{
|
|
200
|
+
"type": "response",
|
|
201
|
+
"text": final_text_response
|
|
202
|
+
}]
|
|
203
|
+
# Keep done = True
|
|
204
|
+
else:
|
|
205
|
+
logger.info("No computer calls and no final text message found.")
|
|
206
|
+
# Keep done = True, actions remains empty
|
|
207
|
+
|
|
190
208
|
return actions, done
|
hud/env/docker_client.py
CHANGED
|
@@ -215,7 +215,7 @@ class DockerClient(Client):
|
|
|
215
215
|
raise ValueError("Could not find package name in pyproject.toml")
|
|
216
216
|
logger.info("Installing %s in /root/controller", self._package_name)
|
|
217
217
|
result = await self.execute(
|
|
218
|
-
["bash", "-c", "cd /root/controller && pip install -e ."],
|
|
218
|
+
["bash", "-c", "cd /root/controller && pip install -e . --break-system-packages"],
|
|
219
219
|
timeout=60,
|
|
220
220
|
)
|
|
221
221
|
if result["stdout"]:
|