hud-python 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +13 -10
- hud/adapters/claude/adapter.py +30 -18
- hud/adapters/common/adapter.py +0 -1
- hud/adapters/common/types.py +129 -4
- hud/adapters/operator/adapter.py +23 -13
- hud/agent/base.py +5 -4
- hud/agent/claude.py +65 -13
- hud/agent/claude_plays_pokemon.py +2 -2
- hud/agent/langchain.py +8 -2
- hud/agent/operator.py +36 -11
- hud/agent/tests/test_base.py +2 -2
- hud/env/docker_client.py +26 -3
- hud/env/environment.py +86 -40
- hud/env/local_docker_client.py +50 -4
- hud/env/remote_client.py +22 -4
- hud/env/remote_docker_client.py +6 -2
- hud/gym.py +15 -4
- hud/job.py +91 -26
- hud/settings.py +6 -0
- hud/task.py +84 -6
- hud/taskset.py +63 -8
- hud/telemetry/exporter.py +4 -6
- hud/trajectory.py +3 -0
- hud/types.py +28 -2
- hud/utils/agent.py +37 -0
- hud/utils/common.py +142 -26
- hud/utils/config.py +11 -0
- hud/utils/tests/test_common.py +225 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.2.6.dist-info → hud_python-0.2.8.dist-info}/METADATA +9 -6
- {hud_python-0.2.6.dist-info → hud_python-0.2.8.dist-info}/RECORD +34 -33
- {hud_python-0.2.6.dist-info → hud_python-0.2.8.dist-info}/WHEEL +0 -0
- {hud_python-0.2.6.dist-info → hud_python-0.2.8.dist-info}/licenses/LICENSE +0 -0
|
@@ -11,7 +11,7 @@ from anthropic.types.beta import (
|
|
|
11
11
|
BetaImageBlockParam,
|
|
12
12
|
)
|
|
13
13
|
|
|
14
|
-
from hud.adapters.common.types import CLA
|
|
14
|
+
from hud.adapters.common.types import CLA, LogType
|
|
15
15
|
from hud.agent import Agent
|
|
16
16
|
from hud.adapters import Adapter
|
|
17
17
|
from hud.settings import settings
|
|
@@ -192,7 +192,7 @@ class ClaudePlaysPokemon(Agent[AsyncAnthropic, CLA]):
|
|
|
192
192
|
observation: The current game observation
|
|
193
193
|
|
|
194
194
|
Returns:
|
|
195
|
-
tuple[list[dict[str, Any]], bool]: List of actions
|
|
195
|
+
tuple[list[dict[str, Any]], bool, list[LogType] | None]: List of actions, whether the game is done, and a list of strings or dictionaries of logs.
|
|
196
196
|
|
|
197
197
|
Raises:
|
|
198
198
|
ValueError: If client is not initialized
|
hud/agent/langchain.py
CHANGED
|
@@ -24,6 +24,7 @@ from hud.adapters.common.types import (
|
|
|
24
24
|
WaitAction,
|
|
25
25
|
ResponseAction,
|
|
26
26
|
CustomAction,
|
|
27
|
+
LogType,
|
|
27
28
|
# Exclude ScreenshotFetch, PositionFetch as they are internal
|
|
28
29
|
)
|
|
29
30
|
|
|
@@ -74,6 +75,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
|
|
|
74
75
|
langchain_model: LangchainModelOrRunnable,
|
|
75
76
|
adapter: Optional[Adapter] = None,
|
|
76
77
|
system_prompt: str | None = None,
|
|
78
|
+
name: str | None = None,
|
|
77
79
|
):
|
|
78
80
|
"""
|
|
79
81
|
Initialize the LangchainAgent.
|
|
@@ -88,7 +90,9 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
|
|
|
88
90
|
system_prompt: An optional system prompt to guide the Langchain model.
|
|
89
91
|
If None, a default prompt encouraging single CLA output is used.
|
|
90
92
|
"""
|
|
91
|
-
super().__init__(
|
|
93
|
+
super().__init__(
|
|
94
|
+
client=langchain_model, adapter=adapter, name=name
|
|
95
|
+
) # Store model as 'client'
|
|
92
96
|
self.langchain_model = langchain_model # Also store with specific name
|
|
93
97
|
|
|
94
98
|
self.system_prompt_str = system_prompt or self._get_default_system_prompt()
|
|
@@ -137,7 +141,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
|
|
|
137
141
|
if not human_content:
|
|
138
142
|
logger.warning("LangchainAgent received an observation with no text or screenshot.")
|
|
139
143
|
# Decide how to handle empty observation - perhaps return no action?
|
|
140
|
-
return [], False
|
|
144
|
+
return [], False
|
|
141
145
|
|
|
142
146
|
current_human_message = HumanMessage(content=human_content)
|
|
143
147
|
|
|
@@ -202,7 +206,9 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainMode
|
|
|
202
206
|
# TODO: Consider history truncation/summarization if it grows too long
|
|
203
207
|
|
|
204
208
|
if actual_action:
|
|
209
|
+
actual_action = actual_action.model_dump()
|
|
205
210
|
# Return the single action dictionary within a list
|
|
211
|
+
actual_action["logs"] = ai_message_content_for_history
|
|
206
212
|
return [actual_action], is_done
|
|
207
213
|
else:
|
|
208
214
|
# Should ideally not happen if structure validation worked, but as a fallback
|
hud/agent/operator.py
CHANGED
|
@@ -19,6 +19,7 @@ from hud.adapters.operator import OperatorAdapter
|
|
|
19
19
|
from hud.types import Gym
|
|
20
20
|
from hud.utils.common import Observation
|
|
21
21
|
from hud.settings import settings
|
|
22
|
+
from hud.adapters.common.types import LogType
|
|
22
23
|
|
|
23
24
|
logger = logging.getLogger(__name__)
|
|
24
25
|
|
|
@@ -37,9 +38,10 @@ class OperatorAgent(Agent[AsyncOpenAI, dict[str, Any]]):
|
|
|
37
38
|
self,
|
|
38
39
|
client: AsyncOpenAI | None = None,
|
|
39
40
|
model: str = "computer-use-preview",
|
|
40
|
-
environment: Literal["windows", "mac", "linux", "browser"] = "
|
|
41
|
+
environment: Literal["windows", "mac", "linux", "browser"] = "browser",
|
|
41
42
|
adapter: Adapter | None = None,
|
|
42
43
|
max_iterations: int = 8,
|
|
44
|
+
name: str | None = None,
|
|
43
45
|
):
|
|
44
46
|
"""
|
|
45
47
|
Initialize the OperatorAgent.
|
|
@@ -50,6 +52,7 @@ class OperatorAgent(Agent[AsyncOpenAI, dict[str, Any]]):
|
|
|
50
52
|
environment: The environment type (windows, mac, linux, browser)
|
|
51
53
|
adapter: The adapter to use for preprocessing and postprocessing
|
|
52
54
|
max_iterations: Maximum number of iterations for the agent
|
|
55
|
+
name: The name of the agent
|
|
53
56
|
"""
|
|
54
57
|
# Initialize client if not provided
|
|
55
58
|
if client is None:
|
|
@@ -65,7 +68,10 @@ class OperatorAgent(Agent[AsyncOpenAI, dict[str, Any]]):
|
|
|
65
68
|
|
|
66
69
|
adapter = adapter or OperatorAdapter()
|
|
67
70
|
|
|
68
|
-
|
|
71
|
+
if name is None:
|
|
72
|
+
name = f"openai-{model}"
|
|
73
|
+
|
|
74
|
+
super().__init__(client=client, adapter=adapter, name=name)
|
|
69
75
|
|
|
70
76
|
self.model = model
|
|
71
77
|
self.environment = environment
|
|
@@ -86,6 +92,8 @@ class OperatorAgent(Agent[AsyncOpenAI, dict[str, Any]]):
|
|
|
86
92
|
self.initial_prompt = None
|
|
87
93
|
self.pending_safety_checks = []
|
|
88
94
|
|
|
95
|
+
self.task_run_id = None
|
|
96
|
+
|
|
89
97
|
async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
|
|
90
98
|
"""
|
|
91
99
|
Fetch a response from the model based on the observation.
|
|
@@ -94,8 +102,8 @@ class OperatorAgent(Agent[AsyncOpenAI, dict[str, Any]]):
|
|
|
94
102
|
observation: The preprocessed observation
|
|
95
103
|
|
|
96
104
|
Returns:
|
|
97
|
-
tuple[list[dict[str, Any]], bool]: A tuple containing the list of raw actions
|
|
98
|
-
boolean indicating if the agent believes the task is complete
|
|
105
|
+
tuple[list[dict[str, Any]], bool, list[LogType] | None]: A tuple containing the list of raw actions,
|
|
106
|
+
boolean indicating if the agent believes the task is complete.
|
|
99
107
|
"""
|
|
100
108
|
if not self.client:
|
|
101
109
|
raise ValueError("Client is required")
|
|
@@ -112,7 +120,7 @@ class OperatorAgent(Agent[AsyncOpenAI, dict[str, Any]]):
|
|
|
112
120
|
)
|
|
113
121
|
|
|
114
122
|
# Process the observation based on whether it's the first one or a response to an action
|
|
115
|
-
if self.pending_call_id is None and self.last_response_id is None:
|
|
123
|
+
if self.pending_call_id is None: # and self.last_response_id is None:
|
|
116
124
|
# This is the first observation, store and send the prompt
|
|
117
125
|
self.initial_prompt = observation.text
|
|
118
126
|
|
|
@@ -133,13 +141,15 @@ class OperatorAgent(Agent[AsyncOpenAI, dict[str, Any]]):
|
|
|
133
141
|
# Structure the input correctly for the API using cast
|
|
134
142
|
input_param = cast(ResponseInputParam, [{"role": "user", "content": input_content}])
|
|
135
143
|
|
|
136
|
-
# Call OpenAI API for the initial prompt (asynchronous call)
|
|
137
144
|
response = await self.client.responses.create(
|
|
138
|
-
model=self.model,
|
|
145
|
+
model=self.model,
|
|
146
|
+
tools=[computer_tool],
|
|
147
|
+
input=input_param,
|
|
148
|
+
truncation="auto",
|
|
149
|
+
reasoning={"summary": "auto"},
|
|
139
150
|
)
|
|
140
151
|
|
|
141
152
|
else:
|
|
142
|
-
# This is a response to a previous action
|
|
143
153
|
if not observation.screenshot:
|
|
144
154
|
logger.warning("No screenshot provided for response to action")
|
|
145
155
|
return [], True
|
|
@@ -164,7 +174,6 @@ class OperatorAgent(Agent[AsyncOpenAI, dict[str, Any]]):
|
|
|
164
174
|
)
|
|
165
175
|
self.pending_safety_checks = []
|
|
166
176
|
|
|
167
|
-
# Call OpenAI API for follow-up (asynchronous call)
|
|
168
177
|
response = await self.client.responses.create(
|
|
169
178
|
model=self.model,
|
|
170
179
|
previous_response_id=self.last_response_id,
|
|
@@ -181,6 +190,8 @@ class OperatorAgent(Agent[AsyncOpenAI, dict[str, Any]]):
|
|
|
181
190
|
done = True # Assume done unless a computer call is found
|
|
182
191
|
final_text_response = ""
|
|
183
192
|
|
|
193
|
+
self.pending_call_id = None
|
|
194
|
+
|
|
184
195
|
# Check for computer calls first
|
|
185
196
|
computer_calls = [
|
|
186
197
|
item
|
|
@@ -217,8 +228,22 @@ class OperatorAgent(Agent[AsyncOpenAI, dict[str, Any]]):
|
|
|
217
228
|
# No ResponseAgent logic here anymore - just return the response
|
|
218
229
|
actions = [{"type": "response", "text": final_text_response}]
|
|
219
230
|
done = True
|
|
220
|
-
|
|
221
|
-
|
|
231
|
+
else:
|
|
232
|
+
logger.info("No computer calls and no final text message found.")
|
|
222
233
|
# Keep done = True, actions remains empty
|
|
223
234
|
|
|
235
|
+
reasoning = ""
|
|
236
|
+
for item in response.output:
|
|
237
|
+
if item.type == "reasoning" and item.summary:
|
|
238
|
+
reasoning += f"Thinking: {item.summary[0].text}\n"
|
|
239
|
+
elif item.type == "message":
|
|
240
|
+
for content in item.content:
|
|
241
|
+
if isinstance(content, ResponseOutputText):
|
|
242
|
+
reasoning += f"{content.text}\n"
|
|
243
|
+
|
|
244
|
+
# add reasoning to the actions
|
|
245
|
+
for action in actions:
|
|
246
|
+
action["reasoning"] = reasoning
|
|
247
|
+
action["logs"] = response.model_dump() # type: ignore[assignment]
|
|
248
|
+
|
|
224
249
|
return actions, done
|
hud/agent/tests/test_base.py
CHANGED
|
@@ -22,9 +22,9 @@ class ConcreteAgent(Agent[Any, dict[str, Any]]):
|
|
|
22
22
|
async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
|
|
23
23
|
"""Mock implementation that returns predefined responses."""
|
|
24
24
|
if self.call_count < len(self.mock_responses):
|
|
25
|
-
|
|
25
|
+
actions, done = self.mock_responses[self.call_count]
|
|
26
26
|
self.call_count += 1
|
|
27
|
-
return
|
|
27
|
+
return actions, done
|
|
28
28
|
return [], True
|
|
29
29
|
|
|
30
30
|
|
hud/env/docker_client.py
CHANGED
|
@@ -12,7 +12,7 @@ import toml
|
|
|
12
12
|
|
|
13
13
|
from hud.env.client import Client
|
|
14
14
|
from hud.types import EnvironmentStatus
|
|
15
|
-
from hud.utils.common import directory_to_tar_bytes
|
|
15
|
+
from hud.utils.common import _compile_pathspec, directory_to_tar_bytes
|
|
16
16
|
|
|
17
17
|
if TYPE_CHECKING:
|
|
18
18
|
from hud.utils import ExecuteResult
|
|
@@ -151,15 +151,32 @@ class DockerClient(Client):
|
|
|
151
151
|
if not self._source_path:
|
|
152
152
|
return {}
|
|
153
153
|
|
|
154
|
-
|
|
154
|
+
# Build ignore spec (currently we only care about .hudignore but reuse
|
|
155
|
+
# the common helper for consistency).
|
|
156
|
+
spec = _compile_pathspec(
|
|
157
|
+
self._source_path,
|
|
158
|
+
respect_gitignore=False,
|
|
159
|
+
respect_dockerignore=False,
|
|
160
|
+
respect_hudignore=True,
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
file_mtimes: dict[str, float] = {}
|
|
164
|
+
|
|
155
165
|
for root, _, files in os.walk(self._source_path):
|
|
156
166
|
for file in files:
|
|
157
167
|
file_path = Path(root) / file
|
|
168
|
+
rel_path = file_path.relative_to(self._source_path).as_posix()
|
|
169
|
+
|
|
170
|
+
# Skip ignored files
|
|
171
|
+
if spec and spec.match_file(rel_path):
|
|
172
|
+
continue
|
|
173
|
+
|
|
158
174
|
try:
|
|
159
175
|
file_mtimes[str(file_path)] = file_path.stat().st_mtime
|
|
160
176
|
except (FileNotFoundError, PermissionError):
|
|
161
177
|
# Skip files that can't be accessed
|
|
162
178
|
continue
|
|
179
|
+
|
|
163
180
|
return file_mtimes
|
|
164
181
|
|
|
165
182
|
async def needs_update(self) -> bool:
|
|
@@ -181,6 +198,11 @@ class DockerClient(Client):
|
|
|
181
198
|
if not self._last_file_mtimes:
|
|
182
199
|
return True
|
|
183
200
|
|
|
201
|
+
# Check for removed files
|
|
202
|
+
for file_path in self._last_file_mtimes:
|
|
203
|
+
if file_path not in current_mtimes:
|
|
204
|
+
return True
|
|
205
|
+
|
|
184
206
|
# Check for new or modified files
|
|
185
207
|
for file_path, mtime in current_mtimes.items():
|
|
186
208
|
if file_path not in self._last_file_mtimes or mtime > self._last_file_mtimes[file_path]:
|
|
@@ -287,7 +309,8 @@ class DockerClient(Client):
|
|
|
287
309
|
if len(stdout_parts) > 1:
|
|
288
310
|
result = json.loads(stdout_parts[1])
|
|
289
311
|
else:
|
|
290
|
-
|
|
312
|
+
logger.warning("Potential error: %s", stderr)
|
|
313
|
+
result = None
|
|
291
314
|
|
|
292
315
|
return result, stdout, stderr
|
|
293
316
|
|
hud/env/environment.py
CHANGED
|
@@ -8,14 +8,14 @@ from typing import TYPE_CHECKING, Any
|
|
|
8
8
|
from pydantic import BaseModel
|
|
9
9
|
|
|
10
10
|
from hud.env.client import Client
|
|
11
|
-
from hud.env.remote_client import RemoteClient
|
|
11
|
+
from hud.env.remote_client import RemoteClient, SetupRequest
|
|
12
12
|
from hud.task import Task
|
|
13
|
+
from hud.utils.agent import format_agent_prompt
|
|
13
14
|
from hud.utils.common import FunctionConfig, FunctionConfigs, Observation
|
|
14
15
|
from hud.utils.config import (
|
|
15
16
|
LOCAL_EVALUATORS,
|
|
16
17
|
REMOTE_EVALUATE,
|
|
17
18
|
REMOTE_FUNCTION_PREFIX,
|
|
18
|
-
REMOTE_SETUP,
|
|
19
19
|
expand_config,
|
|
20
20
|
)
|
|
21
21
|
from hud.utils.telemetry import stream
|
|
@@ -41,9 +41,15 @@ class Environment(BaseModel):
|
|
|
41
41
|
task: Task | None = None
|
|
42
42
|
build_data: dict[str, Any]
|
|
43
43
|
|
|
44
|
+
# The task run id
|
|
45
|
+
task_run_id: str | None = None
|
|
46
|
+
|
|
44
47
|
# final response
|
|
45
48
|
final_response: str | None = None
|
|
46
49
|
|
|
50
|
+
# environment prompt information
|
|
51
|
+
environment_prompt: str | None = None
|
|
52
|
+
|
|
47
53
|
async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]:
|
|
48
54
|
# Execute each config and collect results
|
|
49
55
|
configs_all = [configs] if not isinstance(configs, list) else configs
|
|
@@ -69,24 +75,45 @@ class Environment(BaseModel):
|
|
|
69
75
|
async def _setup(self, config: FunctionConfigs | None = None) -> None:
|
|
70
76
|
"""
|
|
71
77
|
Setup the environment.
|
|
78
|
+
No-op if no config or task is provided.
|
|
72
79
|
|
|
73
80
|
Args:
|
|
74
81
|
config: The configuration to use for the setup
|
|
75
82
|
"""
|
|
76
83
|
if isinstance(self.client, RemoteClient):
|
|
77
84
|
await self.get_urls()
|
|
78
|
-
|
|
85
|
+
|
|
86
|
+
setup_request = SetupRequest()
|
|
87
|
+
|
|
88
|
+
if self.task:
|
|
89
|
+
setup_request.task_id = self.task.id
|
|
90
|
+
setup_request.config = self.task.config
|
|
91
|
+
setup_request.metadata = _format_task_metadata(self.task)
|
|
92
|
+
if self.task.setup:
|
|
93
|
+
setup_request.setup = expand_config(self.task.setup)[0]
|
|
94
|
+
elif config:
|
|
95
|
+
setup_request.setup = expand_config(config)[0]
|
|
96
|
+
else:
|
|
97
|
+
raise ValueError("No task or config provided for remote environment")
|
|
98
|
+
|
|
99
|
+
result = await self.client.setup(setup_request)
|
|
100
|
+
|
|
101
|
+
if result and result.get("id"):
|
|
102
|
+
self.task_run_id = result.get("id")
|
|
103
|
+
logger.info("View the live trace at https://app.hud.so/trace/%s", self.task_run_id)
|
|
104
|
+
else:
|
|
105
|
+
logger.warning("No task run id found in the result")
|
|
79
106
|
else:
|
|
80
107
|
if config is not None:
|
|
81
108
|
await self._invoke_all(config)
|
|
82
109
|
elif self.task and self.task.setup is not None:
|
|
83
110
|
await self._invoke_all(self.task.setup)
|
|
84
|
-
else:
|
|
85
|
-
raise ValueError(
|
|
86
|
-
"No config, task or task setup function provided for local environment"
|
|
87
|
-
)
|
|
88
111
|
|
|
89
|
-
async def evaluate(
|
|
112
|
+
async def evaluate(
|
|
113
|
+
self,
|
|
114
|
+
config: FunctionConfigs | None = None,
|
|
115
|
+
metadata: dict[str, Any] | None = None,
|
|
116
|
+
) -> Any:
|
|
90
117
|
"""
|
|
91
118
|
Evaluate the environment.
|
|
92
119
|
|
|
@@ -97,7 +124,9 @@ class Environment(BaseModel):
|
|
|
97
124
|
Any: Result of the evaluation
|
|
98
125
|
"""
|
|
99
126
|
if isinstance(self.client, RemoteClient):
|
|
100
|
-
results = await self._invoke_all(
|
|
127
|
+
results = await self._invoke_all(
|
|
128
|
+
create_remote_config(self, config, REMOTE_EVALUATE, metadata)
|
|
129
|
+
)
|
|
101
130
|
else:
|
|
102
131
|
if config is not None:
|
|
103
132
|
results = await self._invoke_all(config)
|
|
@@ -110,27 +139,32 @@ class Environment(BaseModel):
|
|
|
110
139
|
else:
|
|
111
140
|
return results
|
|
112
141
|
|
|
113
|
-
async def reset(
|
|
114
|
-
self, configs: FunctionConfigs | None = None
|
|
115
|
-
) -> tuple[Observation, dict[str, Any]]:
|
|
142
|
+
async def reset(self) -> tuple[Observation, dict[str, Any]]:
|
|
116
143
|
"""
|
|
117
|
-
Reset the environment.
|
|
144
|
+
Reset the environment and return the first observation with the agent prompt.
|
|
118
145
|
|
|
119
146
|
Args:
|
|
120
|
-
|
|
147
|
+
None
|
|
121
148
|
|
|
122
149
|
Returns:
|
|
123
|
-
Observation: The first observation from the environment
|
|
150
|
+
Observation: The first observation from the environment with the agent prompt
|
|
124
151
|
info: Dictionary of information about the environment
|
|
125
152
|
"""
|
|
126
153
|
# await self._setup(configs)
|
|
127
154
|
obs, _, _, info = await self.step()
|
|
128
|
-
|
|
129
|
-
|
|
155
|
+
|
|
156
|
+
if self.build_data.get("environment_prompt"):
|
|
157
|
+
self.environment_prompt = self.build_data["environment_prompt"]
|
|
158
|
+
|
|
159
|
+
# Format the agent prompt with the environment prompt and the task prompt
|
|
160
|
+
obs.text = format_agent_prompt(self.environment_prompt, self.task)
|
|
161
|
+
|
|
130
162
|
return obs, info
|
|
131
163
|
|
|
132
164
|
async def step(
|
|
133
|
-
self,
|
|
165
|
+
self,
|
|
166
|
+
actions: CLA | list[CLA] | None = None,
|
|
167
|
+
verbose: bool = False,
|
|
134
168
|
) -> tuple[Observation, float, bool, dict[str, Any]]:
|
|
135
169
|
"""Execute a step in the environment.
|
|
136
170
|
|
|
@@ -152,10 +186,11 @@ class Environment(BaseModel):
|
|
|
152
186
|
result, stdout, stderr = await self.client.invoke(
|
|
153
187
|
FunctionConfig(function="step", args=args)
|
|
154
188
|
)
|
|
155
|
-
if
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
189
|
+
if verbose:
|
|
190
|
+
if stdout:
|
|
191
|
+
logger.info("Step produced stdout: %s", stdout.decode())
|
|
192
|
+
if stderr:
|
|
193
|
+
logger.warning("Step produced stderr: %s", stderr.decode())
|
|
159
194
|
|
|
160
195
|
observation = Observation.model_validate(result["observation"], strict=True)
|
|
161
196
|
|
|
@@ -199,12 +234,12 @@ class Environment(BaseModel):
|
|
|
199
234
|
await self.client.close()
|
|
200
235
|
|
|
201
236
|
async def stream(self) -> str | None:
|
|
202
|
-
|
|
203
|
-
|
|
237
|
+
if not self.live_url:
|
|
238
|
+
await self.get_urls()
|
|
239
|
+
if self.live_url is None:
|
|
204
240
|
logger.warning("No live URL found")
|
|
205
241
|
return None
|
|
206
|
-
|
|
207
|
-
return stream(urls["live_url"])
|
|
242
|
+
return stream(self.live_url)
|
|
208
243
|
|
|
209
244
|
async def run(self, agent: Agent, max_steps: int = 27, verbose: bool = True) -> Any:
|
|
210
245
|
"""Run an agent in the environment.
|
|
@@ -218,7 +253,11 @@ class Environment(BaseModel):
|
|
|
218
253
|
for i in range(max_steps):
|
|
219
254
|
action, done = await agent.predict(obs, verbose=verbose)
|
|
220
255
|
if verbose:
|
|
221
|
-
logger.info(
|
|
256
|
+
logger.info(
|
|
257
|
+
"Step %d: Action: %s",
|
|
258
|
+
i,
|
|
259
|
+
[str(a) for a in action] if len(action) > 1 else str(action[0]),
|
|
260
|
+
)
|
|
222
261
|
obs, reward, terminated, info = await self.step(action)
|
|
223
262
|
if verbose:
|
|
224
263
|
logger.info("Step %d: Observation: %s", i, obs)
|
|
@@ -230,10 +269,21 @@ class Environment(BaseModel):
|
|
|
230
269
|
return result
|
|
231
270
|
|
|
232
271
|
|
|
272
|
+
def _format_task_metadata(task: Task) -> dict[str, Any]:
|
|
273
|
+
metadata = {}
|
|
274
|
+
if task.metadata:
|
|
275
|
+
for key, value in task.metadata.items():
|
|
276
|
+
metadata[str(key)] = value
|
|
277
|
+
if task.sensitive_data:
|
|
278
|
+
metadata["sensitive_data"] = task.sensitive_data
|
|
279
|
+
return metadata
|
|
280
|
+
|
|
281
|
+
|
|
233
282
|
def create_remote_config(
|
|
234
283
|
env: Environment | None = None,
|
|
235
284
|
config: FunctionConfigs | None = None,
|
|
236
285
|
function: str | None = None,
|
|
286
|
+
metadata: dict[str, Any] | None = None,
|
|
237
287
|
) -> list[FunctionConfig]:
|
|
238
288
|
"""
|
|
239
289
|
Create a remote configuration for setup or evaluate, determining the final
|
|
@@ -317,6 +367,8 @@ def create_remote_config(
|
|
|
317
367
|
`[FunctionConfig(function='evaluate', args=[])]`
|
|
318
368
|
"""
|
|
319
369
|
# If no function provided, just expand the config and return it directly
|
|
370
|
+
if metadata is None:
|
|
371
|
+
metadata = {}
|
|
320
372
|
if function is None:
|
|
321
373
|
if config:
|
|
322
374
|
return expand_config(config)
|
|
@@ -330,7 +382,7 @@ def create_remote_config(
|
|
|
330
382
|
if not isinstance(expanded_configs[0].args, list):
|
|
331
383
|
expanded_configs[0].args = [expanded_configs[0].args]
|
|
332
384
|
expanded_configs[0].args.append(env.final_response) # for remote responses
|
|
333
|
-
return [FunctionConfig(function=function, args=expanded_configs)]
|
|
385
|
+
return [FunctionConfig(function=function, args=expanded_configs, metadata=metadata)]
|
|
334
386
|
|
|
335
387
|
# Otherwise, use the environment's task
|
|
336
388
|
task = env.task if env else None
|
|
@@ -339,6 +391,8 @@ def create_remote_config(
|
|
|
339
391
|
if task is None:
|
|
340
392
|
raise ValueError("Either task or config must be provided")
|
|
341
393
|
|
|
394
|
+
metadata = _format_task_metadata(task)
|
|
395
|
+
|
|
342
396
|
# Case 2: Task has the specified function attribute
|
|
343
397
|
task_config = getattr(task, function, None)
|
|
344
398
|
if task_config:
|
|
@@ -350,11 +404,7 @@ def create_remote_config(
|
|
|
350
404
|
if not isinstance(expanded_configs[0].args, list):
|
|
351
405
|
expanded_configs[0].args = [expanded_configs[0].args]
|
|
352
406
|
expanded_configs[0].args.append(env.final_response) # for remote responses
|
|
353
|
-
return [
|
|
354
|
-
FunctionConfig(
|
|
355
|
-
function=function, args=expanded_configs, metadata={"task": task.model_dump()}
|
|
356
|
-
)
|
|
357
|
-
]
|
|
407
|
+
return [FunctionConfig(function=function, args=expanded_configs, metadata=metadata)]
|
|
358
408
|
|
|
359
409
|
# Case 3: Check for task.config
|
|
360
410
|
if hasattr(task, "config") and task.config:
|
|
@@ -369,11 +419,7 @@ def create_remote_config(
|
|
|
369
419
|
if not isinstance(final_args["args"], list):
|
|
370
420
|
final_args["args"] = [final_args["args"]]
|
|
371
421
|
final_args["args"].append(env.final_response)
|
|
372
|
-
return [
|
|
373
|
-
FunctionConfig(
|
|
374
|
-
function=function, args=[final_args], metadata={"task": task.model_dump()}
|
|
375
|
-
)
|
|
376
|
-
]
|
|
422
|
+
return [FunctionConfig(function=function, args=[final_args], metadata=metadata)]
|
|
377
423
|
|
|
378
424
|
# Case 4: Use task.id
|
|
379
425
|
if task.id:
|
|
@@ -384,7 +430,7 @@ def create_remote_config(
|
|
|
384
430
|
FunctionConfig(
|
|
385
431
|
function=f"{REMOTE_FUNCTION_PREFIX}{function}",
|
|
386
432
|
args=args_list,
|
|
387
|
-
metadata=
|
|
433
|
+
metadata=metadata,
|
|
388
434
|
)
|
|
389
435
|
]
|
|
390
436
|
|
|
@@ -392,4 +438,4 @@ def create_remote_config(
|
|
|
392
438
|
args_list = []
|
|
393
439
|
if env and env.final_response:
|
|
394
440
|
args_list.append(env.final_response)
|
|
395
|
-
return [FunctionConfig(function=function, args=args_list, metadata=
|
|
441
|
+
return [FunctionConfig(function=function, args=args_list, metadata=metadata)]
|
hud/env/local_docker_client.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
|
+
import contextlib
|
|
4
5
|
import io
|
|
5
6
|
import logging
|
|
6
7
|
import textwrap
|
|
@@ -34,6 +35,7 @@ class LocalDockerClient(DockerClient):
|
|
|
34
35
|
"""
|
|
35
36
|
Build an image from a build context.
|
|
36
37
|
"""
|
|
38
|
+
logger.info("Building image from %s", build_context)
|
|
37
39
|
# Create a unique image tag
|
|
38
40
|
image_tag = f"hud-env-{uuid.uuid4().hex[:8]}"
|
|
39
41
|
|
|
@@ -67,6 +69,7 @@ class LocalDockerClient(DockerClient):
|
|
|
67
69
|
async def create(
|
|
68
70
|
cls,
|
|
69
71
|
image: str,
|
|
72
|
+
host_config: dict[str, Any] | None = None,
|
|
70
73
|
) -> LocalDockerClient:
|
|
71
74
|
"""
|
|
72
75
|
Creates a Docker environment client from a image.
|
|
@@ -81,20 +84,42 @@ class LocalDockerClient(DockerClient):
|
|
|
81
84
|
# Initialize Docker client
|
|
82
85
|
docker_client = aiodocker.Docker()
|
|
83
86
|
|
|
87
|
+
# Default host config
|
|
88
|
+
if host_config is None:
|
|
89
|
+
host_config = {
|
|
90
|
+
"PublishAllPorts": True,
|
|
91
|
+
}
|
|
92
|
+
|
|
84
93
|
# Create and start the container
|
|
85
94
|
container_config = {
|
|
86
95
|
"Image": image,
|
|
87
96
|
"Tty": True,
|
|
88
97
|
"OpenStdin": True,
|
|
89
98
|
"Cmd": None,
|
|
90
|
-
"HostConfig":
|
|
91
|
-
"PublishAllPorts": True,
|
|
92
|
-
},
|
|
99
|
+
"HostConfig": host_config,
|
|
93
100
|
}
|
|
94
101
|
|
|
95
102
|
container = await docker_client.containers.create(config=container_config)
|
|
96
103
|
await container.start()
|
|
97
104
|
|
|
105
|
+
# --------------------------------------------------
|
|
106
|
+
# Stream container logs while we wait for readiness
|
|
107
|
+
# --------------------------------------------------
|
|
108
|
+
async def _stream_logs() -> None:
|
|
109
|
+
try:
|
|
110
|
+
# .log() with follow=True -> async iterator of bytes/str
|
|
111
|
+
async for raw in container.log(stdout=True, stderr=True, follow=True):
|
|
112
|
+
if isinstance(raw, bytes):
|
|
113
|
+
raw = raw.decode(errors="replace")
|
|
114
|
+
logger.info("container %s | %s", container.id[:12], raw.rstrip())
|
|
115
|
+
except asyncio.CancelledError:
|
|
116
|
+
# task cancelled during cleanup - silently exit
|
|
117
|
+
return
|
|
118
|
+
except Exception:
|
|
119
|
+
logger.exception("error while streaming logs from %s", container.id[:12])
|
|
120
|
+
|
|
121
|
+
log_task: asyncio.Task | None = asyncio.create_task(_stream_logs())
|
|
122
|
+
|
|
98
123
|
inspection = await container.show()
|
|
99
124
|
if health_check_config := inspection["Config"].get("Healthcheck"):
|
|
100
125
|
# Using the interval as spinup deadline is a bit implicit - could
|
|
@@ -115,9 +140,21 @@ class LocalDockerClient(DockerClient):
|
|
|
115
140
|
raise TimeoutError(f"{container.id} not healthy after {window_secs}s")
|
|
116
141
|
await asyncio.sleep(1)
|
|
117
142
|
logger.debug("Container %s is healthy", container.id)
|
|
143
|
+
else:
|
|
144
|
+
logger.debug("Container %s has no healthcheck, assuming ready", container.id)
|
|
145
|
+
|
|
146
|
+
# Stop the log stream now that the container is ready
|
|
147
|
+
if log_task is not None:
|
|
148
|
+
log_task.cancel()
|
|
149
|
+
with contextlib.suppress(Exception):
|
|
150
|
+
await log_task
|
|
151
|
+
log_task = None
|
|
118
152
|
|
|
119
153
|
# Return the controller instance
|
|
120
|
-
|
|
154
|
+
client = cls(docker_client, container.id)
|
|
155
|
+
# store the task so close() can cancel if it is still running
|
|
156
|
+
client._log_task = log_task # type: ignore[attr-defined]
|
|
157
|
+
return client
|
|
121
158
|
|
|
122
159
|
def __init__(self, docker_conn: aiodocker.Docker, container_id: str) -> None:
|
|
123
160
|
"""
|
|
@@ -135,6 +172,9 @@ class LocalDockerClient(DockerClient):
|
|
|
135
172
|
# Docker client will be initialized when needed
|
|
136
173
|
self._docker = docker_conn
|
|
137
174
|
|
|
175
|
+
# Background task for streaming logs (may be None)
|
|
176
|
+
self._log_task: asyncio.Task | None = None
|
|
177
|
+
|
|
138
178
|
@property
|
|
139
179
|
def container_id(self) -> str:
|
|
140
180
|
"""Get the container ID."""
|
|
@@ -288,3 +328,9 @@ class LocalDockerClient(DockerClient):
|
|
|
288
328
|
logger.warning("Error during Docker container cleanup: %s", e)
|
|
289
329
|
finally:
|
|
290
330
|
await self._docker.close()
|
|
331
|
+
|
|
332
|
+
# Cancel background log forwarding first (if still active)
|
|
333
|
+
if self._log_task is not None:
|
|
334
|
+
self._log_task.cancel()
|
|
335
|
+
with contextlib.suppress(Exception):
|
|
336
|
+
await self._log_task
|