hud-python 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +13 -10
- hud/adapters/claude/adapter.py +30 -18
- hud/adapters/common/adapter.py +0 -1
- hud/adapters/common/types.py +129 -4
- hud/adapters/operator/adapter.py +23 -13
- hud/agent/base.py +5 -4
- hud/agent/claude.py +65 -13
- hud/agent/claude_plays_pokemon.py +2 -2
- hud/agent/langchain.py +8 -2
- hud/agent/operator.py +36 -11
- hud/agent/tests/test_base.py +2 -2
- hud/env/docker_client.py +26 -3
- hud/env/environment.py +86 -40
- hud/env/local_docker_client.py +50 -4
- hud/env/remote_client.py +22 -4
- hud/env/remote_docker_client.py +6 -2
- hud/gym.py +15 -4
- hud/job.py +91 -26
- hud/settings.py +6 -0
- hud/task.py +84 -6
- hud/taskset.py +63 -8
- hud/telemetry/exporter.py +4 -6
- hud/trajectory.py +3 -0
- hud/types.py +28 -2
- hud/utils/agent.py +37 -0
- hud/utils/common.py +142 -26
- hud/utils/config.py +11 -0
- hud/utils/tests/test_common.py +225 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.2.6.dist-info → hud_python-0.2.8.dist-info}/METADATA +9 -6
- {hud_python-0.2.6.dist-info → hud_python-0.2.8.dist-info}/RECORD +34 -33
- {hud_python-0.2.6.dist-info → hud_python-0.2.8.dist-info}/WHEEL +0 -0
- {hud_python-0.2.6.dist-info → hud_python-0.2.8.dist-info}/licenses/LICENSE +0 -0
hud/types.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import enum
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Literal, TypeAlias
|
|
5
|
+
from typing import Any, Literal, TypeAlias
|
|
6
6
|
|
|
7
7
|
from pydantic import BaseModel
|
|
8
8
|
|
|
@@ -28,6 +28,9 @@ class CustomGym(BaseModel):
|
|
|
28
28
|
# B. If string, then it is the uri of the docker image to use.
|
|
29
29
|
# The controller must already be installed in the image.
|
|
30
30
|
image_or_build_context: str | Path
|
|
31
|
+
# host_config will be passed to the docker client when creating the environment.
|
|
32
|
+
# refer to official docker api documentation for available configs.
|
|
33
|
+
host_config: dict[str, Any] | None = None
|
|
31
34
|
|
|
32
35
|
|
|
33
36
|
class EnvironmentStatus(str, enum.Enum):
|
|
@@ -48,7 +51,30 @@ class EnvironmentStatus(str, enum.Enum):
|
|
|
48
51
|
|
|
49
52
|
|
|
50
53
|
# Available HUD gyms
|
|
51
|
-
ServerGym: TypeAlias = Literal["qa", "hud-browser", "OSWorld-Ubuntu"]
|
|
54
|
+
ServerGym: TypeAlias = Literal["qa", "hud-browser", "OSWorld-Ubuntu", "docker"]
|
|
52
55
|
|
|
53
56
|
# Gyms can be either custom or server-side
|
|
54
57
|
Gym: TypeAlias = CustomGym | ServerGym
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Metadata keys for the environment.
|
|
61
|
+
# partial: Whether the environment evaluator should give partial grades.
|
|
62
|
+
# eval_model: The model to use for evaluation when running a VLM. Wraps langchain.
|
|
63
|
+
# agent_name: The name of the agent that was used for running this task.
|
|
64
|
+
ServerMetadataKeys: TypeAlias = Literal["partial", "eval_model", "agent_name"]
|
|
65
|
+
MetadataKeys: TypeAlias = str | ServerMetadataKeys
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# Dictionary of sensitive data (only supported for hud-browser environments)
|
|
69
|
+
# key: website name or page identifier
|
|
70
|
+
# value: Dictionary of credentials for the sensitive data
|
|
71
|
+
# Example:
|
|
72
|
+
# {
|
|
73
|
+
# "google.com": {
|
|
74
|
+
# "google_username": "my_username",
|
|
75
|
+
# "google_password": "my_password"
|
|
76
|
+
# }
|
|
77
|
+
# }
|
|
78
|
+
# The agent only has access to the key of the credential, not the value. (i.e. google_username)
|
|
79
|
+
# The value is only available to the environment. (i.e. my_username)
|
|
80
|
+
SensitiveData: TypeAlias = dict[str, dict[str, str]]
|
hud/utils/agent.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from hud.task import Task
|
|
7
|
+
|
|
8
|
+
AGENT_PROMPT = (
    "You are an AI agent whose goal is to accomplish the ultimate task following the instructions."
)


def format_agent_prompt(environment_prompt: str | None, task: Task | None) -> str:
    """
    Format the agent prompt with the environment prompt and the task prompt.

    The result is a sequence of sections separated by a blank line, in this order:

    1. ``AGENT_PROMPT``.
    2. ``task.system_prompt`` if set, otherwise ``environment_prompt`` (if any).
    3. When ``task.sensitive_data`` is set: one ``domain: key, key, ...`` line per
       domain, followed by a usage hint. Only the credential keys are listed,
       never the values.
    4. ``task.prompt`` (if any).

    Args:
        environment_prompt: Prompt supplied by the environment, or None.
        task: Task being run, or None.

    Returns:
        The assembled prompt string.
    """
    # Collect sections and join once instead of growing a string with "+=".
    sections = [AGENT_PROMPT]

    # User-provided system prompt takes precedence over environment prompt
    if task and task.system_prompt:
        sections.append(task.system_prompt)
    elif environment_prompt:
        sections.append(environment_prompt)

    if task:
        if task.sensitive_data:
            placeholder_lines = ["Here are placeholders for sensitive data for each domain:"]
            placeholder_lines.extend(
                f"{domain}: {', '.join(str(key) for key in credentials)}"
                for domain, credentials in task.sensitive_data.items()
            )
            sections.append("\n".join(placeholder_lines))
            sections.append(
                "You can type these placeholders to enter the sensitive data when needed."
            )

        if task.prompt:
            sections.append(task.prompt)

    return "\n\n".join(sections)
|
hud/utils/common.py
CHANGED
|
@@ -6,6 +6,7 @@ import tarfile
|
|
|
6
6
|
import zipfile
|
|
7
7
|
from typing import TYPE_CHECKING, Any, TypedDict
|
|
8
8
|
|
|
9
|
+
from pathspec import PathSpec
|
|
9
10
|
from pydantic import BaseModel
|
|
10
11
|
|
|
11
12
|
from hud.server.requests import make_request
|
|
@@ -67,8 +68,8 @@ class Observation(BaseModel):
|
|
|
67
68
|
|
|
68
69
|
def __str__(self) -> str:
    """Return a short, log-friendly summary of the observation.

    ``screenshot`` and ``text`` are each shown as at most their first 100
    characters. The ``...`` marker is appended only when the value was
    actually cut, so short values are not displayed as if truncated.
    Missing or empty values are rendered as ``None``.
    """

    def _preview(value):
        # Falsy (None or empty) values are rendered as the literal "None".
        if not value:
            return "None"
        return f"{value[:100]}..." if len(value) > 100 else value

    return f"Observation(screenshot={_preview(self.screenshot)}, text={_preview(self.text)})"
|
|
72
73
|
|
|
73
74
|
|
|
74
75
|
class ExecuteResult(TypedDict):
|
|
@@ -86,44 +87,159 @@ class ExecuteResult(TypedDict):
|
|
|
86
87
|
exit_code: int
|
|
87
88
|
|
|
88
89
|
|
|
89
|
-
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
# Helper functions for handling ignore patterns
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _read_ignore_file(file_path: Path) -> list[str]:
    """Return patterns from *file_path* (ignoring blanks / comments).

    Anything that is not a regular file (a missing path, or a directory that
    merely shares the ignore file's name) yields an empty list. The file is
    decoded as UTF-8 so the result does not depend on the locale encoding.
    """
    if not file_path.is_file():
        return []

    stripped_lines = (line.strip() for line in file_path.read_text(encoding="utf-8").splitlines())
    return [line for line in stripped_lines if line and not line.startswith("#")]


def _gather_ignore_patterns(root_dir: Path, filename: str) -> list[str]:
    """Collect *filename* patterns throughout *root_dir* respecting hierarchy.

    Patterns are rewritten so that they can be evaluated relative to
    *root_dir* when passed to ``PathSpec``:

    * Patterns from the ignore file directly in *root_dir* are returned
      unchanged, so a leading ``/`` keeps anchoring them to the root.
    * For a nested ignore file located at ``sub/dir/.gitignore``:
      - an anchored pattern (one with a ``/`` at its start or in its middle)
        such as ``/foo`` or ``a/b`` becomes ``sub/dir/foo`` / ``sub/dir/a/b``;
      - an un-anchored pattern such as ``foo/`` or ``*.pyc`` becomes
        ``sub/dir/**/foo/`` / ``sub/dir/**/*.pyc`` so that it still matches
        at any depth below ``sub/dir``.
    * A leading ``!`` (negation) is preserved in front of the rewritten pattern.

    Ignore files are processed from the shallowest to the deepest directory
    (ties broken by path), so the returned order is deterministic and patterns
    from nested files come after those of their parents.
    """
    gathered: list[str] = []

    root_dir = root_dir.resolve()

    ignore_files = sorted(
        root_dir.rglob(filename),
        key=lambda found: (len(found.relative_to(root_dir).parts), found.as_posix()),
    )

    for ignore_file in ignore_files:
        prefix = ignore_file.parent.relative_to(root_dir).as_posix()
        base_prefix = "" if prefix == "." else prefix

        for pat in _read_ignore_file(ignore_file):
            negate = pat.startswith("!")
            pat_body = pat[1:] if negate else pat

            if not base_prefix:
                # Root-level file: the pattern is already relative to root_dir.
                # Keep a leading slash, otherwise "/build" would turn into the
                # match-anywhere pattern "build".
                full_pattern = pat_body
            elif "/" in pat_body.rstrip("/"):
                # Anchored to the directory holding the ignore file: drop the
                # leading slash (if any) and prepend that directory.
                full_pattern = f"{base_prefix}/{pat_body.lstrip('/')}"
            else:
                # Un-anchored: may match at any depth below the ignore file.
                full_pattern = f"{base_prefix}/**/{pat_body}"

            if negate:
                full_pattern = f"!{full_pattern}"

            gathered.append(full_pattern)

    return gathered
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _compile_pathspec(
    directory: Path,
    *,
    respect_gitignore: bool,
    respect_dockerignore: bool,
    respect_hudignore: bool,
) -> PathSpec | None:
    """Build one ``PathSpec`` from the enabled ignore files found under *directory*.

    Three ignore file names are recognised: ``.gitignore``, ``.dockerignore``
    and the project-specific ``.hudignore``. Each is consulted only when its
    ``respect_*`` flag is true, and their patterns are concatenated in that
    order before being compiled as ``gitwildmatch`` patterns.

    Returns ``None`` when no pattern was collected at all.
    """
    toggles = (
        (".gitignore", respect_gitignore),
        (".dockerignore", respect_dockerignore),
        (".hudignore", respect_hudignore),
    )

    collected: list[str] = []
    for ignore_name, enabled in toggles:
        if enabled:
            collected.extend(_gather_ignore_patterns(directory, ignore_name))

    if collected:
        return PathSpec.from_lines("gitwildmatch", collected)
    return None
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _iter_files(
    directory: Path,
    *,
    respect_gitignore: bool,
    respect_dockerignore: bool,
    respect_hudignore: bool,
) -> Iterator[tuple[Path, Path]]:
    """Yield ``(file_path, relative_path)`` for every non-ignored file under *directory*.

    The tree is walked recursively. Entries that are not files are skipped,
    and so is any file whose POSIX-style relative path matches the ignore
    spec compiled from the enabled ``respect_*`` flags.
    """
    ignore_spec = _compile_pathspec(
        directory,
        respect_gitignore=respect_gitignore,
        respect_dockerignore=respect_dockerignore,
        respect_hudignore=respect_hudignore,
    )

    for candidate in directory.rglob("*"):
        if candidate.is_file():
            relative = candidate.relative_to(directory)
            # The spec is matched against the forward-slash form of the path.
            is_ignored = ignore_spec and ignore_spec.match_file(relative.as_posix())
            if not is_ignored:
                yield candidate, relative
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def directory_to_tar_bytes(
    directory_path: Path,
    *,
    respect_gitignore: bool = False,
    respect_dockerignore: bool = False,
    respect_hudignore: bool = True,
) -> bytes:
    """
    Converts a directory to a tar archive and returns it as bytes.

    Files are added under their path relative to ``directory_path``; only the
    entries yielded by ``_iter_files`` are archived.

    By default only the rules defined in ``.hudignore`` files are respected.
    ``.gitignore`` and ``.dockerignore`` rules are opt-in: pass
    ``respect_gitignore=True`` / ``respect_dockerignore=True`` to apply them,
    or ``respect_hudignore=False`` to archive everything.

    Args:
        directory_path: Path to the directory to convert.
        respect_gitignore: Apply patterns from ``.gitignore`` files.
        respect_dockerignore: Apply patterns from ``.dockerignore`` files.
        respect_hudignore: Apply patterns from ``.hudignore`` files.

    Returns:
        The uncompressed tar archive as bytes.
    """
    output = io.BytesIO()

    with tarfile.open(fileobj=output, mode="w") as tar:
        for file_path, rel_path in _iter_files(
            directory_path,
            respect_gitignore=respect_gitignore,
            respect_dockerignore=respect_dockerignore,
            respect_hudignore=respect_hudignore,
        ):
            logger.debug("Adding %s to tar archive", rel_path)
            tar.add(file_path, arcname=str(rel_path))

    # getvalue() returns the whole buffer regardless of the stream position,
    # so no seek(0) is needed before reading it back.
    return output.getvalue()
|
|
116
223
|
|
|
117
224
|
|
|
118
|
-
def directory_to_zip_bytes(
|
|
119
|
-
|
|
225
|
+
def directory_to_zip_bytes(
    context_dir: Path,
    *,
    respect_gitignore: bool = False,
    respect_dockerignore: bool = False,
    respect_hudignore: bool = True,
) -> bytes:
    """Return a deflate-compressed zip of *context_dir* as bytes, honouring ignore rules.

    Each file yielded by ``_iter_files`` is stored under its path relative to
    *context_dir*. The ``respect_*`` flags are forwarded unchanged to
    ``_iter_files``.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        entries = _iter_files(
            context_dir,
            respect_gitignore=respect_gitignore,
            respect_dockerignore=respect_dockerignore,
            respect_hudignore=respect_hudignore,
        )
        for source, relative in entries:
            logger.debug("Adding %s to zip archive", relative)
            archive.write(str(source), arcname=str(relative))
    return buffer.getvalue()
|
|
128
244
|
|
|
129
245
|
|
hud/utils/config.py
CHANGED
|
@@ -103,6 +103,17 @@ def expand_config(config: FunctionConfigs) -> list[FunctionConfig]:
|
|
|
103
103
|
|
|
104
104
|
return [FunctionConfig(function=function_name, args=args)]
|
|
105
105
|
|
|
106
|
+
if isinstance(config, list):
|
|
107
|
+
result = []
|
|
108
|
+
for item in config:
|
|
109
|
+
if isinstance(item, tuple) and len(item) >= 1 and isinstance(item[0], str):
|
|
110
|
+
function_name = item[0]
|
|
111
|
+
args = list(item[1:]) if len(item) > 1 else []
|
|
112
|
+
result.append(FunctionConfig(function=function_name, args=args))
|
|
113
|
+
else:
|
|
114
|
+
raise ValueError(f"Invalid list item configuration: {item}")
|
|
115
|
+
return result
|
|
116
|
+
|
|
106
117
|
# Unknown configuration type
|
|
107
118
|
error_msg = f"Unknown configuration type: {type(config)}"
|
|
108
119
|
logger.error(error_msg)
|
hud/utils/tests/test_common.py
CHANGED
|
@@ -50,3 +50,228 @@ async def test_get_gym_id(mocker: pytest_mock.MockerFixture):
|
|
|
50
50
|
mocker.patch("hud.utils.common.make_request", return_value={"id": "test_gym_id"})
|
|
51
51
|
gym_id = await get_gym_id("test_gym")
|
|
52
52
|
assert gym_id == "test_gym_id"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_function_config_stores_function_name_args_and_optional_id():
    """FunctionConfig keeps the function name, the args list and an optional id."""
    from hud.utils.common import FunctionConfig

    # Bare config: empty args, id left unset.
    bare = FunctionConfig(function="test_func", args=[])
    assert bare.function == "test_func"
    assert bare.args == []
    assert bare.id is None

    # Mixed-type args are stored in the order given.
    navigate = FunctionConfig(function="navigate", args=["https://example.com", {"wait": True}])
    assert navigate.function == "navigate"
    assert len(navigate.args) == 2
    assert navigate.args[0] == "https://example.com"
    assert navigate.args[1] == {"wait": True}

    # An explicit id is stored alongside name and args.
    identified = FunctionConfig(
        function="complex_operation",
        args=[42, "test", {"nested": {"key": "value"}}],
        id="op_123",
    )
    assert identified.function == "complex_operation"
    assert len(identified.args) == 3
    assert identified.id == "op_123"
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@pytest.mark.asyncio
async def test_get_gym_id_fetches_id_from_api_response(
    mocker: pytest_mock.MockerFixture,
):
    """get_gym_id returns the 'id' field of the API response, ignoring the other fields."""
    # make_request is patched to hand back a payload with extra keys next to 'id'.
    mocker.patch(
        "hud.utils.common.make_request",
        return_value={"id": "gym-123", "name": "Test Gym", "status": "active"},
    )

    resolved_id = await get_gym_id("test_gym")

    assert resolved_id == "gym-123"
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@pytest.mark.asyncio
async def test_get_gym_id_propagates_network_errors(mocker: pytest_mock.MockerFixture):
    """An exception raised by make_request surfaces unchanged from get_gym_id."""
    failure = ConnectionError("API unavailable")
    mocker.patch("hud.utils.common.make_request", side_effect=failure)

    # The original exception type and message must reach the caller.
    with pytest.raises(ConnectionError, match="API unavailable"):
        await get_gym_id("test_gym")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@pytest.mark.asyncio
async def test_get_gym_id_raises_key_error_when_id_missing(
    mocker: pytest_mock.MockerFixture,
):
    """A response without an 'id' field makes get_gym_id raise KeyError."""
    # Payload deliberately lacks the 'id' key.
    mocker.patch(
        "hud.utils.common.make_request",
        return_value={"name": "Test Gym", "status": "active"},
    )

    with pytest.raises(KeyError):
        await get_gym_id("test_gym")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def test_directory_to_tar_bytes_creates_valid_tar_archive(
    tmpdir_factory: pytest.TempdirFactory,
):
    """The archive is readable, lists every file and keeps file contents intact."""
    source_dir = Path(tmpdir_factory.mktemp("test_archive"))

    # Two top-level files plus one file in a sub-directory.
    (source_dir / "file1.txt").write_text("content1")
    (source_dir / "file2.py").write_text("import os\nprint('hello')")
    nested_dir = source_dir / "subdir"
    nested_dir.mkdir()
    (nested_dir / "nested.json").write_text('{"key": "value"}')

    archive_bytes = directory_to_tar_bytes(source_dir)

    assert isinstance(archive_bytes, bytes)
    assert len(archive_bytes) > 0

    with tarfile.open(fileobj=io.BytesIO(archive_bytes), mode="r:*") as archive:
        names = {member.name for member in archive.getmembers()}
        for expected_name in ("file1.txt", "file2.py", "subdir/nested.json"):
            assert expected_name in names

        # Contents must round-trip unchanged.
        extracted = archive.extractfile("file1.txt")
        assert extracted is not None
        assert extracted.read().decode() == "content1"
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def test_directory_to_tar_bytes_handles_empty_directory(
    tmpdir_factory: pytest.TempdirFactory,
):
    """directory_to_tar_bytes should produce a valid, member-less tar for an empty directory."""
    # Arrange
    empty_dir = tmpdir_factory.mktemp("empty")
    empty_dir_path = Path(empty_dir)

    # Act
    tar_bytes = directory_to_tar_bytes(empty_dir_path)

    # Assert
    assert isinstance(tar_bytes, bytes)
    assert len(tar_bytes) > 0  # Even empty tar has headers

    with tarfile.open(fileobj=io.BytesIO(tar_bytes), mode="r:*") as tar:
        # Only files are archived, so an empty directory yields no members.
        # (The previous ``len(members) >= 0`` check could never fail.)
        assert tar.getmembers() == []
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def test_directory_to_tar_bytes_preserves_directory_structure(
    tmpdir_factory: pytest.TempdirFactory,
):
    """Files at every nesting level appear in the archive under their relative path."""
    base = Path(tmpdir_factory.mktemp("root"))

    # One file per level of a/b/c.
    deepest = base / "a" / "b" / "c"
    deepest.mkdir(parents=True)
    (base / "a" / "file1.txt").write_text("in a")
    (base / "a" / "b" / "file2.txt").write_text("in b")
    (deepest / "file3.txt").write_text("in c")

    archive_bytes = directory_to_tar_bytes(base)

    with tarfile.open(fileobj=io.BytesIO(archive_bytes), mode="r:*") as archive:
        names = {member.name for member in archive.getmembers()}
        for expected_name in ("a/file1.txt", "a/b/file2.txt", "a/b/c/file3.txt"):
            assert expected_name in names
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def test_directory_to_tar_bytes_with_exclusions(tmpdir_factory: pytest.TempdirFactory):
    """Paths matched by a ``.hudignore`` file are left out of the archive."""
    temp_dir = tmpdir_factory.mktemp("test_exclude_dir")
    temp_dir_path = Path(temp_dir)

    # Create various files
    (temp_dir_path / "include_me.txt").write_text("include")
    (temp_dir_path / ".git").mkdir()
    (temp_dir_path / ".git" / "config").write_text("git config")
    (temp_dir_path / "__pycache__").mkdir()
    (temp_dir_path / "__pycache__" / "module.pyc").write_bytes(b"pyc content")
    (temp_dir_path / "normal_dir").mkdir()
    (temp_dir_path / "normal_dir" / "file.py").write_text("python code")

    # directory_to_tar_bytes has no built-in exclusions; with default arguments
    # it only honours .hudignore, so the exclusions are declared there.
    (temp_dir_path / ".hudignore").write_text(".git/\n__pycache__/\n")

    tar_bytes = directory_to_tar_bytes(temp_dir_path)

    # Check contents
    with tarfile.open(fileobj=io.BytesIO(tar_bytes), mode="r:*") as tar:
        member_names = {m.name for m in tar.getmembers()}

        # Should include regular files and directories
        assert "include_me.txt" in member_names
        assert "normal_dir/file.py" in member_names

        # Everything under the ignored directories must be excluded.
        assert ".git/config" not in member_names
        assert "__pycache__/module.pyc" not in member_names
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def test_directory_to_tar_bytes_empty_directory(tmpdir_factory: pytest.TempdirFactory):
    """Test directory_to_tar_bytes with empty directory."""
    temp_dir = tmpdir_factory.mktemp("empty_dir")
    temp_dir_path = Path(temp_dir)

    tar_bytes = directory_to_tar_bytes(temp_dir_path)

    # Should still create a valid tar even if empty
    assert tar_bytes is not None
    assert len(tar_bytes) > 0

    with tarfile.open(fileobj=io.BytesIO(tar_bytes), mode="r:*") as tar:
        # Only files are archived, so the archive has no members at all.
        # (The previous ``len(members) >= 0`` check could never fail.)
        assert tar.getmembers() == []
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def test_directory_to_tar_bytes_symlinks(tmpdir_factory: pytest.TempdirFactory):
    """Test directory_to_tar_bytes with symbolic links.

    A symlink to a file is archived as a symlink member (tarfile does not
    dereference links by default); the target is archived as a regular file.
    """
    temp_dir = tmpdir_factory.mktemp("symlink_dir")
    temp_dir_path = Path(temp_dir)

    # Create a file and a symlink to it
    target_file = temp_dir_path / "target.txt"
    target_file.write_text("target content")

    symlink = temp_dir_path / "link_to_target.txt"
    try:
        symlink.symlink_to(target_file)
        has_symlink = True
    except OSError:
        # Symlinks might not be supported on all systems (e.g., Windows without admin)
        has_symlink = False

    tar_bytes = directory_to_tar_bytes(temp_dir_path)

    with tarfile.open(fileobj=io.BytesIO(tar_bytes), mode="r:*") as tar:
        members = {m.name: m for m in tar.getmembers()}

        assert "target.txt" in members
        assert members["target.txt"].isfile()

        if has_symlink:
            # The link itself is stored, not a second copy of the target.
            assert "link_to_target.txt" in members
            assert members["link_to_target.txt"].issym()
|
hud/utils/tests/test_version.py
CHANGED
hud/version.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.2.6
|
|
3
|
+
Version: 0.2.8
|
|
4
4
|
Summary: SDK for the HUD evaluation platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
|
|
@@ -47,6 +47,7 @@ Requires-Dist: langchain-openai
|
|
|
47
47
|
Requires-Dist: mcp
|
|
48
48
|
Requires-Dist: numpy
|
|
49
49
|
Requires-Dist: openai
|
|
50
|
+
Requires-Dist: pathspec>=0.12.1
|
|
50
51
|
Requires-Dist: pillow>=11.1.0
|
|
51
52
|
Requires-Dist: pydantic-settings<3,>=2
|
|
52
53
|
Requires-Dist: pydantic<3,>=2
|
|
@@ -61,7 +62,7 @@ Requires-Dist: ipython<9; extra == 'dev'
|
|
|
61
62
|
Requires-Dist: jupyter-client; extra == 'dev'
|
|
62
63
|
Requires-Dist: jupyter-core; extra == 'dev'
|
|
63
64
|
Requires-Dist: openai; extra == 'dev'
|
|
64
|
-
Requires-Dist: pyright==1.1.
|
|
65
|
+
Requires-Dist: pyright==1.1.401; extra == 'dev'
|
|
65
66
|
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
66
67
|
Requires-Dist: pytest-cov; extra == 'dev'
|
|
67
68
|
Requires-Dist: pytest-mock; extra == 'dev'
|
|
@@ -90,7 +91,7 @@ We're here to help with eval strategies, custom environments, or improving your
|
|
|
90
91
|
|
|
91
92
|
## ✨ What You Can Do
|
|
92
93
|
|
|
93
|
-
**Evaluate Existing Benchmarks**
|
|
94
|
+
**[Evaluate Existing Benchmarks](https://docs.hud.so/examples/benchmarking-agents)**
|
|
94
95
|
```python
|
|
95
96
|
from hud import load_taskset, run_job, ClaudeAgent
|
|
96
97
|
|
|
@@ -98,7 +99,7 @@ taskset = await load_taskset("WebVoyager") # or GAIA, OSWorld-Ubuntu, Mind2Web
|
|
|
98
99
|
job = await run_job(ClaudeAgent, taskset, "my-evaluation")
|
|
99
100
|
```
|
|
100
101
|
|
|
101
|
-
**Create Custom Tasks**
|
|
102
|
+
**[Create Custom Tasks](https://docs.hud.so/task-creation)**
|
|
102
103
|
```python
|
|
103
104
|
from hud.task import Task
|
|
104
105
|
|
|
@@ -110,7 +111,7 @@ task = Task(
|
|
|
110
111
|
)
|
|
111
112
|
```
|
|
112
113
|
|
|
113
|
-
**Build Custom Environments**
|
|
114
|
+
**[Build Custom Environments](https://docs.hud.so/environment-creation)**
|
|
114
115
|
```python
|
|
115
116
|
from hud.types import CustomGym
|
|
116
117
|
|
|
@@ -123,7 +124,7 @@ custom_gym = CustomGym(
|
|
|
123
124
|
# Or create complex Docker environments - see environments/ folder for examples
|
|
124
125
|
```
|
|
125
126
|
|
|
126
|
-
**Trace Tool Calls Alongside HUD Environments (or Independently)**
|
|
127
|
+
**[Trace Tool Calls Alongside HUD Environments (or Independently)](https://docs.hud.so/examples/mcp-agent-tracing)**
|
|
127
128
|
```python
|
|
128
129
|
import hud
|
|
129
130
|
|
|
@@ -171,6 +172,7 @@ async def main():
|
|
|
171
172
|
setup=("goto", "google.com"),
|
|
172
173
|
evaluate=("contains_text", "capybara")
|
|
173
174
|
)
|
|
175
|
+
print(f"Running task with prompt: {task.prompt}")
|
|
174
176
|
|
|
175
177
|
# Create environment using the gym module
|
|
176
178
|
env = await gym.make(task)
|
|
@@ -182,6 +184,7 @@ async def main():
|
|
|
182
184
|
obs, _ = await env.reset() # Gets first observation
|
|
183
185
|
for i in range(5):
|
|
184
186
|
actions, done = await agent.predict(obs)
|
|
187
|
+
print(f"Agent action {i}: {actions}")
|
|
185
188
|
|
|
186
189
|
obs, reward, terminated, info = await env.step(actions)
|
|
187
190
|
if done or terminated: break
|