hud-python 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +13 -10
- hud/adapters/claude/adapter.py +30 -18
- hud/adapters/common/adapter.py +0 -1
- hud/adapters/common/types.py +129 -4
- hud/adapters/operator/adapter.py +23 -13
- hud/agent/base.py +5 -4
- hud/agent/claude.py +65 -13
- hud/agent/claude_plays_pokemon.py +3 -2
- hud/agent/langchain.py +8 -2
- hud/agent/operator.py +36 -11
- hud/agent/tests/test_base.py +2 -2
- hud/env/docker_client.py +24 -2
- hud/env/environment.py +86 -40
- hud/env/local_docker_client.py +50 -4
- hud/env/remote_client.py +22 -4
- hud/env/remote_docker_client.py +8 -4
- hud/gym.py +15 -4
- hud/job.py +100 -35
- hud/server/requests.py +26 -4
- hud/settings.py +7 -1
- hud/task.py +84 -6
- hud/taskset.py +79 -12
- hud/telemetry/context.py +33 -57
- hud/telemetry/exporter.py +4 -6
- hud/telemetry/instrumentation/mcp.py +0 -3
- hud/telemetry/tests/test_context.py +7 -3
- hud/trajectory.py +3 -0
- hud/types.py +28 -2
- hud/utils/agent.py +37 -0
- hud/utils/common.py +142 -26
- hud/utils/config.py +11 -0
- hud/utils/tests/test_common.py +225 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.2.5.dist-info → hud_python-0.2.7.dist-info}/METADATA +26 -23
- {hud_python-0.2.5.dist-info → hud_python-0.2.7.dist-info}/RECORD +38 -37
- {hud_python-0.2.5.dist-info → hud_python-0.2.7.dist-info}/WHEEL +0 -0
- {hud_python-0.2.5.dist-info → hud_python-0.2.7.dist-info}/licenses/LICENSE +0 -0
hud/taskset.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from
|
|
3
|
+
from pathlib import PosixPath
|
|
4
|
+
from typing import TYPE_CHECKING, Any, get_args
|
|
4
5
|
from venv import logger
|
|
5
6
|
|
|
6
7
|
from pydantic import BaseModel
|
|
@@ -9,6 +10,7 @@ from hud.env.environment import create_remote_config
|
|
|
9
10
|
from hud.server import make_request
|
|
10
11
|
from hud.settings import settings
|
|
11
12
|
from hud.task import Task
|
|
13
|
+
from hud.types import CustomGym, ServerGym
|
|
12
14
|
from hud.utils.config import REMOTE_EVALUATE, REMOTE_SETUP
|
|
13
15
|
|
|
14
16
|
if TYPE_CHECKING:
|
|
@@ -86,16 +88,45 @@ class TaskSet(BaseModel):
|
|
|
86
88
|
# Convert all tasks to expanded configs
|
|
87
89
|
processed_tasks = []
|
|
88
90
|
for task in self.tasks:
|
|
89
|
-
|
|
90
|
-
|
|
91
|
+
if task.setup is not None:
|
|
92
|
+
setup_config = (
|
|
93
|
+
create_remote_config(None, task.setup, REMOTE_SETUP)[0].args[0].model_dump()
|
|
94
|
+
)
|
|
95
|
+
else:
|
|
96
|
+
setup_config = None
|
|
97
|
+
if task.evaluate is not None:
|
|
98
|
+
evaluate_config = (
|
|
99
|
+
create_remote_config(None, task.evaluate, REMOTE_EVALUATE)[0]
|
|
100
|
+
.args[0]
|
|
101
|
+
.model_dump()
|
|
102
|
+
)
|
|
103
|
+
else:
|
|
104
|
+
evaluate_config = None
|
|
105
|
+
|
|
106
|
+
if isinstance(task.gym, CustomGym):
|
|
107
|
+
if isinstance(task.gym.image_or_build_context, PosixPath):
|
|
108
|
+
raise ValueError(
|
|
109
|
+
"Local build contexts are not supported for "
|
|
110
|
+
"remote tasksets, attach an image or existing "
|
|
111
|
+
"gym id."
|
|
112
|
+
)
|
|
113
|
+
gym_str = "docker"
|
|
114
|
+
image_uri = task.gym.image_or_build_context
|
|
115
|
+
elif isinstance(task.gym, str) and task.gym in get_args(ServerGym):
|
|
116
|
+
gym_str = task.gym
|
|
117
|
+
image_uri = None
|
|
118
|
+
else:
|
|
119
|
+
raise ValueError(f"Unknown gym type: {type(task.gym)}")
|
|
91
120
|
|
|
92
121
|
processed_tasks.append(
|
|
93
122
|
{
|
|
94
123
|
"prompt": task.prompt,
|
|
95
|
-
"gym":
|
|
96
|
-
"setup": setup_config
|
|
97
|
-
"evaluate": evaluate_config
|
|
124
|
+
"gym": gym_str,
|
|
125
|
+
"setup": setup_config,
|
|
126
|
+
"evaluate": evaluate_config,
|
|
98
127
|
"config": task.config,
|
|
128
|
+
"image_uri": image_uri,
|
|
129
|
+
"description": task.description,
|
|
99
130
|
}
|
|
100
131
|
)
|
|
101
132
|
|
|
@@ -113,7 +144,15 @@ class TaskSet(BaseModel):
|
|
|
113
144
|
"Taskset %s uploaded successfully, see it on app.hud.so/evalsets/%s", name, name
|
|
114
145
|
)
|
|
115
146
|
|
|
116
|
-
|
|
147
|
+
def _apply(self, dict: dict[str, Any]) -> None:
    """
    Set the same attribute values on every task in the taskset.

    Args:
        dict: Mapping of attribute name to the value that is assigned to
            that attribute on each task in ``self.tasks``. The parameter
            name shadows the builtin ``dict``; it is kept unchanged so
            existing callers keep working.
    """
    # Snapshot the pairs once instead of rebuilding the items view per task.
    updates = tuple(dict.items())
    for task in self.tasks:
        for key, value in updates:
            setattr(task, key, value)
|
|
154
|
+
|
|
155
|
+
def fit(self, agent: Agent | type[Agent]) -> None:
|
|
117
156
|
"""
|
|
118
157
|
Automatically adapts the taskset to the agent's transfer_gyms.
|
|
119
158
|
"""
|
|
@@ -121,19 +160,27 @@ class TaskSet(BaseModel):
|
|
|
121
160
|
agent = agent()
|
|
122
161
|
|
|
123
162
|
for task in self.tasks:
|
|
124
|
-
if task.gym is None:
|
|
163
|
+
if task.gym is None or isinstance(task.gym, CustomGym):
|
|
125
164
|
continue
|
|
126
165
|
task.gym = agent.transfer_gyms.get(task.gym, task.gym)
|
|
127
166
|
|
|
128
167
|
|
|
129
|
-
async def load_taskset(
|
|
168
|
+
async def load_taskset(
|
|
169
|
+
taskset_id: str,
|
|
170
|
+
api_key: str | None = None,
|
|
171
|
+
metadata: dict[str, Any] | None = None,
|
|
172
|
+
load_custom_as_local: bool = False,
|
|
173
|
+
system_prompt: str | None = None,
|
|
174
|
+
) -> TaskSet:
|
|
130
175
|
"""
|
|
131
176
|
Loads a TaskSet by its ID.
|
|
132
177
|
|
|
133
178
|
Args:
|
|
134
179
|
taskset_id: The ID of the taskset to load
|
|
135
180
|
api_key: Optional API key to use for the request
|
|
136
|
-
|
|
181
|
+
metadata: Optional metadata to apply to the taskset
|
|
182
|
+
load_custom_as_local: Whether to load custom gyms as local
|
|
183
|
+
system_prompt: Optional system prompt to override the default
|
|
137
184
|
Returns:
|
|
138
185
|
TaskSet: The loaded taskset
|
|
139
186
|
"""
|
|
@@ -149,13 +196,33 @@ async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
|
|
|
149
196
|
|
|
150
197
|
logger.info(f"Taskset {taskset_id} loaded successfully")
|
|
151
198
|
|
|
152
|
-
|
|
199
|
+
tasks = data["evalset"]
|
|
200
|
+
for task in tasks:
|
|
201
|
+
if system_prompt:
|
|
202
|
+
task["system_prompt"] = system_prompt
|
|
203
|
+
if task["gym"] == "docker":
|
|
204
|
+
if "image_uri" not in task:
|
|
205
|
+
raise ValueError(
|
|
206
|
+
"No `image_uri` key found. This taskset may be "
|
|
207
|
+
"incompatible with your version of HUD SDK."
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
task["gym"] = CustomGym(
|
|
211
|
+
location="local" if load_custom_as_local else "remote",
|
|
212
|
+
image_or_build_context=task["image_uri"],
|
|
213
|
+
)
|
|
214
|
+
|
|
215
|
+
taskset = TaskSet.model_validate(
|
|
153
216
|
{
|
|
154
217
|
"id": taskset_id,
|
|
155
|
-
"tasks":
|
|
218
|
+
"tasks": tasks,
|
|
156
219
|
}
|
|
157
220
|
)
|
|
158
221
|
|
|
222
|
+
taskset._apply({"metadata": metadata})
|
|
223
|
+
|
|
224
|
+
return taskset
|
|
225
|
+
|
|
159
226
|
|
|
160
227
|
def load_from_inspect(dataset: Dataset) -> TaskSet:
|
|
161
228
|
"""
|
hud/telemetry/context.py
CHANGED
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import contextvars
|
|
4
4
|
import logging
|
|
5
|
+
from collections import defaultdict
|
|
5
6
|
from datetime import datetime
|
|
6
7
|
from typing import Any, TypeVar
|
|
7
8
|
|
|
@@ -11,7 +12,6 @@ from hud.telemetry.mcp_models import (
|
|
|
11
12
|
MCPNotificationCall,
|
|
12
13
|
MCPRequestCall,
|
|
13
14
|
MCPResponseCall,
|
|
14
|
-
MCPTelemetryRecord,
|
|
15
15
|
StatusType,
|
|
16
16
|
)
|
|
17
17
|
|
|
@@ -21,9 +21,8 @@ logger = logging.getLogger("hud.telemetry")
|
|
|
21
21
|
current_task_run_id: contextvars.ContextVar[str | None] = contextvars.ContextVar(
|
|
22
22
|
"current_task_run_id", default=None
|
|
23
23
|
)
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
)
|
|
24
|
+
# NEW: Global dictionary for buffering, keyed by task_run_id
|
|
25
|
+
_GLOBAL_MCP_CALL_BUFFERS: defaultdict[str, list[BaseMCPCall]] = defaultdict(list)
|
|
27
26
|
is_root_trace: contextvars.ContextVar[bool] = contextvars.ContextVar("is_root_trace", default=False)
|
|
28
27
|
|
|
29
28
|
# Maximum buffer size before automatic flush
|
|
@@ -35,51 +34,37 @@ T = TypeVar("T", bound=BaseMCPCall)
|
|
|
35
34
|
|
|
36
35
|
def get_current_task_run_id() -> str | None:
|
|
37
36
|
"""Get the task_run_id for the current trace context."""
|
|
38
|
-
|
|
39
|
-
# Convert empty string sentinel back to None
|
|
40
|
-
return None if value == "" else value
|
|
37
|
+
return current_task_run_id.get()
|
|
41
38
|
|
|
42
39
|
|
|
43
40
|
def set_current_task_run_id(task_run_id: str | None) -> None:
|
|
44
41
|
"""Set the task_run_id for the current trace context."""
|
|
45
|
-
|
|
46
|
-
value_to_set = "" if task_run_id is None else task_run_id
|
|
47
|
-
current_task_run_id.set(value_to_set)
|
|
42
|
+
current_task_run_id.set(task_run_id)
|
|
48
43
|
|
|
49
44
|
|
|
50
45
|
def buffer_mcp_call(record: BaseMCPCall | dict[str, Any]) -> None:
|
|
51
|
-
"""
|
|
52
|
-
Add an MCP call to the buffer for the current trace.
|
|
53
|
-
|
|
54
|
-
Args:
|
|
55
|
-
record: Either a Pydantic model instance or dictionary with MCP call data
|
|
56
|
-
"""
|
|
57
|
-
# Only buffer if we have an active trace
|
|
58
46
|
task_run_id = get_current_task_run_id()
|
|
59
|
-
if task_run_id is not None and task_run_id != "":
|
|
60
|
-
buffer = mcp_calls_buffer.get()
|
|
61
|
-
if buffer is None:
|
|
62
|
-
buffer = []
|
|
63
47
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
48
|
+
if not task_run_id:
|
|
49
|
+
logger.warning(
|
|
50
|
+
"BUFFER_MCP_CALL: No task_run_id. Skipping buffer for %s", type(record).__name__
|
|
51
|
+
)
|
|
52
|
+
return
|
|
67
53
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
54
|
+
# Ensure 'record' is a Pydantic model instance from here
|
|
55
|
+
if isinstance(record, dict):
|
|
56
|
+
try:
|
|
57
|
+
record_model = BaseMCPCall.from_dict(record)
|
|
58
|
+
record = record_model
|
|
59
|
+
except Exception as e_conv:
|
|
60
|
+
logger.exception("BUFFER_MCP_CALL: Failed to convert dict to BaseMCPCall: %s", e_conv)
|
|
61
|
+
return
|
|
74
62
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
mcp_calls_buffer.set(buffer)
|
|
63
|
+
_GLOBAL_MCP_CALL_BUFFERS[task_run_id].append(record)
|
|
64
|
+
buffer_len = len(_GLOBAL_MCP_CALL_BUFFERS[task_run_id])
|
|
78
65
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
logger.debug("MCP calls buffer reached size %d, auto-flushing", len(buffer))
|
|
82
|
-
flush_buffer(export=True)
|
|
66
|
+
if buffer_len >= MAX_BUFFER_SIZE:
|
|
67
|
+
flush_buffer(export=True)
|
|
83
68
|
|
|
84
69
|
|
|
85
70
|
def flush_buffer(export: bool = False) -> list[BaseMCPCall]:
|
|
@@ -92,25 +77,16 @@ def flush_buffer(export: bool = False) -> list[BaseMCPCall]:
|
|
|
92
77
|
Returns:
|
|
93
78
|
The list of buffered MCP calls
|
|
94
79
|
"""
|
|
95
|
-
|
|
96
|
-
if
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
# Create a telemetry record for export
|
|
106
|
-
_telemetry_record = MCPTelemetryRecord(task_run_id=task_id, records=buffer)
|
|
107
|
-
# In the future, we could call an export function here
|
|
108
|
-
# For now, just log that we have telemetry
|
|
109
|
-
logger.debug("MCP telemetry record created with %d calls", len(buffer))
|
|
110
|
-
else:
|
|
111
|
-
logger.warning("No task_run_id found in buffer, skipping export")
|
|
112
|
-
|
|
113
|
-
return buffer
|
|
80
|
+
task_run_id = get_current_task_run_id()
|
|
81
|
+
if not task_run_id:
|
|
82
|
+
logger.warning("FLUSH_BUFFER: No current task_run_id. Cannot flush.")
|
|
83
|
+
return []
|
|
84
|
+
|
|
85
|
+
buffer_for_task = _GLOBAL_MCP_CALL_BUFFERS.pop(
|
|
86
|
+
task_run_id, []
|
|
87
|
+
) # Get and remove the list for this task
|
|
88
|
+
|
|
89
|
+
return buffer_for_task # Return the flushed items
|
|
114
90
|
|
|
115
91
|
|
|
116
92
|
def create_request_record(
|
|
@@ -150,6 +126,7 @@ def create_response_record(
|
|
|
150
126
|
is_error=is_error,
|
|
151
127
|
**kwargs,
|
|
152
128
|
)
|
|
129
|
+
|
|
153
130
|
buffer_mcp_call(record)
|
|
154
131
|
return record
|
|
155
132
|
|
|
@@ -189,5 +166,4 @@ def create_manual_test_record(**custom_data: Any) -> MCPManualTestCall | None:
|
|
|
189
166
|
def reset_context() -> None:
|
|
190
167
|
"""Reset all telemetry context variables. Useful for test isolation."""
|
|
191
168
|
set_current_task_run_id(None)
|
|
192
|
-
mcp_calls_buffer.set([])
|
|
193
169
|
is_root_trace.set(False)
|
hud/telemetry/exporter.py
CHANGED
|
@@ -298,12 +298,10 @@ async def _export_trace_payload_async(payload: dict[str, Any]) -> None:
|
|
|
298
298
|
"telemetry": payload.get("mcp_calls", []),
|
|
299
299
|
}
|
|
300
300
|
|
|
301
|
-
|
|
302
|
-
# if not data_to_send["mcp_calls"]:
|
|
303
|
-
# logger.debug("No MCP calls in payload for task run %s, skipping specific export if "
|
|
304
|
-
# "desired.", task_run_id)
|
|
305
|
-
# # Depending on backend, might not want to send empty mcp_calls list, or it's fine.
|
|
301
|
+
await send_telemetry_to_server(task_run_id, data_to_send)
|
|
306
302
|
|
|
303
|
+
|
|
304
|
+
async def send_telemetry_to_server(task_run_id: str, data: dict[str, Any]) -> None:
|
|
307
305
|
telemetry_url = f"{settings.base_url}/v2/task_runs/{task_run_id}/telemetry-upload"
|
|
308
306
|
|
|
309
307
|
try:
|
|
@@ -320,7 +318,7 @@ async def _export_trace_payload_async(payload: dict[str, Any]) -> None:
|
|
|
320
318
|
)
|
|
321
319
|
response = await client.post(
|
|
322
320
|
telemetry_url,
|
|
323
|
-
json=
|
|
321
|
+
json=data, # Send the structured attributes and mcp_calls
|
|
324
322
|
headers=headers,
|
|
325
323
|
timeout=30.0,
|
|
326
324
|
)
|
|
@@ -31,9 +31,6 @@ from hud.telemetry.mcp_models import DirectionType, MCPCallType, MCPManualTestCa
|
|
|
31
31
|
|
|
32
32
|
logger = logging.getLogger(__name__)
|
|
33
33
|
|
|
34
|
-
# Ensure no OTel imports remain
|
|
35
|
-
# from opentelemetry import context as otel_context, propagate # Should be removed
|
|
36
|
-
|
|
37
34
|
|
|
38
35
|
class MCPInstrumentor:
|
|
39
36
|
"""
|
|
@@ -140,9 +140,13 @@ class TestMCPCallBuffer:
|
|
|
140
140
|
|
|
141
141
|
# Flush should return only the calls buffered for the current task run
|
|
142
142
|
result = flush_buffer()
|
|
143
|
-
assert len(result) ==
|
|
144
|
-
assert result[0] ==
|
|
145
|
-
|
|
143
|
+
assert len(result) == 1
|
|
144
|
+
assert result[0] == mock_call_2
|
|
145
|
+
|
|
146
|
+
set_current_task_run_id("task-1")
|
|
147
|
+
result2 = flush_buffer()
|
|
148
|
+
assert len(result2) == 1
|
|
149
|
+
assert result2[0] == mock_call_1
|
|
146
150
|
|
|
147
151
|
def test_buffer_mcp_call_without_task_id(self):
|
|
148
152
|
"""Test adding MCP call when no task run ID is set."""
|
hud/trajectory.py
CHANGED
|
@@ -6,6 +6,8 @@ import datetime
|
|
|
6
6
|
from IPython.display import HTML, Markdown, display
|
|
7
7
|
from pydantic import BaseModel, Field
|
|
8
8
|
|
|
9
|
+
from .adapters.common.types import LogType
|
|
10
|
+
|
|
9
11
|
|
|
10
12
|
class TrajectoryStep(BaseModel):
|
|
11
13
|
"""Model representing a single task run's trajectory information."""
|
|
@@ -13,6 +15,7 @@ class TrajectoryStep(BaseModel):
|
|
|
13
15
|
observation_url: str | None = None
|
|
14
16
|
observation_text: str | None = None
|
|
15
17
|
actions: list[dict]
|
|
18
|
+
logs: LogType | None = None
|
|
16
19
|
start_timestamp: str | None = None
|
|
17
20
|
end_timestamp: str | None = None
|
|
18
21
|
|
hud/types.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import enum
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Literal, TypeAlias
|
|
5
|
+
from typing import Any, Literal, TypeAlias
|
|
6
6
|
|
|
7
7
|
from pydantic import BaseModel
|
|
8
8
|
|
|
@@ -28,6 +28,9 @@ class CustomGym(BaseModel):
|
|
|
28
28
|
# B. If string, then it is the uri of the docker image to use.
|
|
29
29
|
# The controller must already be installed in the image.
|
|
30
30
|
image_or_build_context: str | Path
|
|
31
|
+
# host_config will be passed to the docker client when creating the environment.
|
|
32
|
+
# refer to official docker api documentation for available configs.
|
|
33
|
+
host_config: dict[str, Any] | None = None
|
|
31
34
|
|
|
32
35
|
|
|
33
36
|
class EnvironmentStatus(str, enum.Enum):
|
|
@@ -48,7 +51,30 @@ class EnvironmentStatus(str, enum.Enum):
|
|
|
48
51
|
|
|
49
52
|
|
|
50
53
|
# Available HUD gyms
|
|
51
|
-
ServerGym: TypeAlias = Literal["qa", "hud-browser", "
|
|
54
|
+
ServerGym: TypeAlias = Literal["qa", "hud-browser", "OSWorld-Ubuntu", "docker"]
|
|
52
55
|
|
|
53
56
|
# Gyms can be either custom or server-side
|
|
54
57
|
Gym: TypeAlias = CustomGym | ServerGym
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Metadata keys for the environment.
|
|
61
|
+
# partial: Whether the environment evaluator should give partial grades.
|
|
62
|
+
# eval_model: The model to use for evaluation when running a VLM. Wraps langchain.
|
|
63
|
+
# agent_name: The name of the agent that was used for running this task.
|
|
64
|
+
ServerMetadataKeys: TypeAlias = Literal["partial", "eval_model", "agent_name"]
|
|
65
|
+
MetadataKeys: TypeAlias = str | ServerMetadataKeys
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# Dictionary of sensitive data (only supported for hud-browser environments)
|
|
69
|
+
# key: website name or page identifier
|
|
70
|
+
# value: Dictionary of credentials for the sensitive data
|
|
71
|
+
# Example:
|
|
72
|
+
# {
|
|
73
|
+
# "google.com": {
|
|
74
|
+
# "google_username": "my_username",
|
|
75
|
+
# "google_password": "my_password"
|
|
76
|
+
# }
|
|
77
|
+
# }
|
|
78
|
+
# The agent only has access to the key of the credential, not the value. (i.e. google_username)
|
|
79
|
+
# The value is only available to the environment. (i.e. my_username)
|
|
80
|
+
SensitiveData: TypeAlias = dict[str, dict[str, str]]
|
hud/utils/agent.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from hud.task import Task
|
|
7
|
+
|
|
8
|
+
AGENT_PROMPT = (
    "You are an AI agent whose goal is to accomplish the ultimate task following the instructions."
)


def format_agent_prompt(environment_prompt: str | None, task: Task | None) -> str:
    """
    Format the agent prompt with the environment prompt and the task prompt.

    The result always starts with ``AGENT_PROMPT``. Further sections are
    appended in this order, each separated from the previous one by a blank
    line:

    1. ``task.system_prompt`` if set, otherwise ``environment_prompt`` if set.
    2. If ``task.sensitive_data`` is set: a header line, one
       ``"<domain>: <key>, <key>"`` line per domain (credential *keys* only,
       never the values), then a usage hint.
    3. ``task.prompt`` if set.

    Args:
        environment_prompt: Prompt supplied by the environment; used only
            when the task does not provide its own system prompt.
        task: Task whose ``system_prompt``, ``sensitive_data`` and ``prompt``
            attributes are read; may be ``None``.

    Returns:
        The assembled prompt string.
    """
    sections = [AGENT_PROMPT]

    # User-provided system prompt takes precedence over environment prompt
    if task and task.system_prompt:
        sections.append(task.system_prompt)
    elif environment_prompt:
        sections.append(environment_prompt)

    if task:
        if task.sensitive_data:
            # Header and per-domain lines form one section joined by single
            # newlines; only the placeholder names (dict keys) are exposed.
            placeholder_lines = ["Here are placeholders for sensitive data for each domain:"]
            placeholder_lines.extend(
                f"{domain}: {', '.join(str(key) for key in credentials)}"
                for domain, credentials in task.sensitive_data.items()
            )
            sections.append("\n".join(placeholder_lines))
            sections.append(
                "You can type these placeholders to enter the sensitive data when needed."
            )

        if task.prompt:
            sections.append(task.prompt)

    # Join once instead of growing the string with repeated "+=".
    return "\n\n".join(sections)
|
hud/utils/common.py
CHANGED
|
@@ -6,6 +6,7 @@ import tarfile
|
|
|
6
6
|
import zipfile
|
|
7
7
|
from typing import TYPE_CHECKING, Any, TypedDict
|
|
8
8
|
|
|
9
|
+
from pathspec import PathSpec
|
|
9
10
|
from pydantic import BaseModel
|
|
10
11
|
|
|
11
12
|
from hud.server.requests import make_request
|
|
@@ -67,8 +68,8 @@ class Observation(BaseModel):
|
|
|
67
68
|
|
|
68
69
|
def __str__(self) -> str:
|
|
69
70
|
return f"""Observation(screenshot={
|
|
70
|
-
self.screenshot[:100] if self.screenshot else "None"
|
|
71
|
-
}
|
|
71
|
+
f"{self.screenshot[:100]}..." if self.screenshot else "None"
|
|
72
|
+
}, text={f"{self.text[:100]}..." if self.text else "None"})"""
|
|
72
73
|
|
|
73
74
|
|
|
74
75
|
class ExecuteResult(TypedDict):
|
|
@@ -86,44 +87,159 @@ class ExecuteResult(TypedDict):
|
|
|
86
87
|
exit_code: int
|
|
87
88
|
|
|
88
89
|
|
|
89
|
-
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
# Helper functions for handling ignore patterns
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _read_ignore_file(file_path: Path) -> list[str]:
    """Return patterns from *file_path* (ignoring blanks / comments).

    Each line is stripped of surrounding whitespace; empty lines and lines
    starting with ``#`` are dropped. The file is decoded as UTF-8 so the
    result does not depend on the platform's locale encoding.

    Args:
        file_path: Path of the ignore file to read.

    Returns:
        The remaining pattern lines in file order, or an empty list when
        *file_path* is not an existing regular file.
    """
    # is_file() rather than exists(): a directory that happens to carry the
    # ignore file's name must be skipped instead of crashing read_text().
    if not file_path.is_file():
        return []

    stripped_lines = (
        line.strip() for line in file_path.read_text(encoding="utf-8").splitlines()
    )
    return [line for line in stripped_lines if line and not line.startswith("#")]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _gather_ignore_patterns(root_dir: Path, filename: str) -> list[str]:
|
|
110
|
+
"""Collect *filename* patterns throughout *root_dir* respecting hierarchy.
|
|
111
|
+
|
|
112
|
+
For a nested ignore file located at ``sub/dir/.gitignore`` containing the
|
|
113
|
+
pattern ``foo/``, the returned pattern will be ``sub/dir/foo/`` so that it
|
|
114
|
+
is evaluated relative to *root_dir* when passed to ``PathSpec``.
|
|
90
115
|
"""
|
|
91
|
-
|
|
116
|
+
gathered: list[str] = []
|
|
117
|
+
|
|
118
|
+
root_dir = root_dir.resolve()
|
|
119
|
+
|
|
120
|
+
for ignore_file in root_dir.rglob(filename):
|
|
121
|
+
prefix = ignore_file.parent.relative_to(root_dir).as_posix()
|
|
122
|
+
base_prefix = "" if prefix == "." else prefix
|
|
123
|
+
|
|
124
|
+
for pat in _read_ignore_file(ignore_file):
|
|
125
|
+
negate = pat.startswith("!")
|
|
126
|
+
pat_body = pat[1:] if negate else pat
|
|
127
|
+
|
|
128
|
+
# Leading slash means relative to the directory the ignore file is
|
|
129
|
+
# located in - remove it so we can prepend *prefix* below.
|
|
130
|
+
if pat_body.startswith("/"):
|
|
131
|
+
pat_body = pat_body.lstrip("/")
|
|
92
132
|
|
|
93
|
-
|
|
94
|
-
|
|
133
|
+
full_pattern = f"{base_prefix}/{pat_body}" if base_prefix else pat_body
|
|
134
|
+
if negate:
|
|
135
|
+
full_pattern = f"!{full_pattern}"
|
|
95
136
|
|
|
96
|
-
|
|
97
|
-
path: Path to the directory to convert
|
|
137
|
+
gathered.append(full_pattern)
|
|
98
138
|
|
|
99
|
-
|
|
100
|
-
|
|
139
|
+
return gathered
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _compile_pathspec(
    directory: Path,
    *,
    respect_gitignore: bool,
    respect_dockerignore: bool,
    respect_hudignore: bool,
) -> PathSpec | None:
    """Build one ``PathSpec`` from the enabled ignore files under *directory*.

    Patterns are gathered, in this order, from ``.gitignore``,
    ``.dockerignore`` and ``.hudignore`` files; each source is consulted only
    when its ``respect_*`` flag is truthy.

    Args:
        directory: Root directory that is searched for ignore files.
        respect_gitignore: Include patterns from ``.gitignore`` files.
        respect_dockerignore: Include patterns from ``.dockerignore`` files.
        respect_hudignore: Include patterns from ``.hudignore`` files.

    Returns:
        A ``PathSpec`` compiled with the ``gitwildmatch`` syntax, or ``None``
        when no patterns were collected.
    """
    sources = (
        (respect_gitignore, ".gitignore"),
        (respect_dockerignore, ".dockerignore"),
        (respect_hudignore, ".hudignore"),
    )
    collected: list[str] = [
        pattern
        for enabled, ignore_name in sources
        if enabled
        for pattern in _gather_ignore_patterns(directory, ignore_name)
    ]

    if collected:
        return PathSpec.from_lines("gitwildmatch", collected)
    return None
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _iter_files(
    directory: Path,
    *,
    respect_gitignore: bool,
    respect_dockerignore: bool,
    respect_hudignore: bool,
) -> Iterator[tuple[Path, Path]]:
    """Yield ``(file_path, relative_path)`` while respecting ignore files.

    Walks *directory* recursively and yields every regular file whose path,
    relative to *directory* and in POSIX form, is not matched by the compiled
    ignore spec. Paths are visited in sorted order so that callers building
    archives from this iterator get the same member order on every run.

    Args:
        directory: Root directory to walk.
        respect_gitignore: Honour ``.gitignore`` files.
        respect_dockerignore: Honour ``.dockerignore`` files.
        respect_hudignore: Honour ``.hudignore`` files.

    Yields:
        Tuples of the file's path as found under *directory* and the same
        path relative to *directory*.
    """
    spec = _compile_pathspec(
        directory,
        respect_gitignore=respect_gitignore,
        respect_dockerignore=respect_dockerignore,
        respect_hudignore=respect_hudignore,
    )

    # rglob() yields entries in filesystem order, which varies between
    # platforms and runs; sorting makes the traversal deterministic.
    for file_path in sorted(directory.rglob("*")):
        if not file_path.is_file():
            continue
        rel_path = file_path.relative_to(directory)
        # The spec is None when no ignore patterns were collected.
        if spec is not None and spec.match_file(rel_path.as_posix()):
            continue
        yield file_path, rel_path
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def directory_to_tar_bytes(
|
|
197
|
+
directory_path: Path,
|
|
198
|
+
*,
|
|
199
|
+
respect_gitignore: bool = False,
|
|
200
|
+
respect_dockerignore: bool = False,
|
|
201
|
+
respect_hudignore: bool = True,
|
|
202
|
+
) -> bytes:
|
|
203
|
+
"""
|
|
204
|
+
Converts a directory to a tar archive and returns it as bytes.
|
|
205
|
+
|
|
206
|
+
By default the archive respects ignore rules defined in ``.hudignore``;
|
|
207
|
+
``.gitignore`` and ``.dockerignore`` rules are opt-in via kwargs.
|
|
101
208
|
"""
|
|
102
209
|
output = io.BytesIO()
|
|
103
210
|
|
|
104
211
|
with tarfile.open(fileobj=output, mode="w") as tar:
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
212
|
+
for file_path, rel_path in _iter_files(
|
|
213
|
+
directory_path,
|
|
214
|
+
respect_gitignore=respect_gitignore,
|
|
215
|
+
respect_dockerignore=respect_dockerignore,
|
|
216
|
+
respect_hudignore=respect_hudignore,
|
|
217
|
+
):
|
|
218
|
+
logger.debug("Adding %s to tar archive", rel_path)
|
|
219
|
+
tar.add(file_path, arcname=str(rel_path))
|
|
220
|
+
|
|
114
221
|
output.seek(0)
|
|
115
222
|
return output.getvalue()
|
|
116
223
|
|
|
117
224
|
|
|
118
|
-
def directory_to_zip_bytes(
|
|
119
|
-
|
|
225
|
+
def directory_to_zip_bytes(
|
|
226
|
+
context_dir: Path,
|
|
227
|
+
*,
|
|
228
|
+
respect_gitignore: bool = False,
|
|
229
|
+
respect_dockerignore: bool = False,
|
|
230
|
+
respect_hudignore: bool = True,
|
|
231
|
+
) -> bytes:
|
|
232
|
+
"""Zip *context_dir* and return the zip archive as bytes, respecting ignore rules."""
|
|
120
233
|
output = io.BytesIO()
|
|
121
234
|
with zipfile.ZipFile(output, "w", zipfile.ZIP_DEFLATED) as zipf:
|
|
122
|
-
for file_path in
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
235
|
+
for file_path, rel_path in _iter_files(
|
|
236
|
+
context_dir,
|
|
237
|
+
respect_gitignore=respect_gitignore,
|
|
238
|
+
respect_dockerignore=respect_dockerignore,
|
|
239
|
+
respect_hudignore=respect_hudignore,
|
|
240
|
+
):
|
|
241
|
+
logger.debug("Adding %s to zip archive", rel_path)
|
|
242
|
+
zipf.write(str(file_path), arcname=str(rel_path))
|
|
127
243
|
return output.getvalue()
|
|
128
244
|
|
|
129
245
|
|
hud/utils/config.py
CHANGED
|
@@ -103,6 +103,17 @@ def expand_config(config: FunctionConfigs) -> list[FunctionConfig]:
|
|
|
103
103
|
|
|
104
104
|
return [FunctionConfig(function=function_name, args=args)]
|
|
105
105
|
|
|
106
|
+
if isinstance(config, list):
|
|
107
|
+
result = []
|
|
108
|
+
for item in config:
|
|
109
|
+
if isinstance(item, tuple) and len(item) >= 1 and isinstance(item[0], str):
|
|
110
|
+
function_name = item[0]
|
|
111
|
+
args = list(item[1:]) if len(item) > 1 else []
|
|
112
|
+
result.append(FunctionConfig(function=function_name, args=args))
|
|
113
|
+
else:
|
|
114
|
+
raise ValueError(f"Invalid list item configuration: {item}")
|
|
115
|
+
return result
|
|
116
|
+
|
|
106
117
|
# Unknown configuration type
|
|
107
118
|
error_msg = f"Unknown configuration type: {type(config)}"
|
|
108
119
|
logger.error(error_msg)
|