hud-python 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__main__.py +8 -0
- hud/agents/base.py +7 -8
- hud/agents/langchain.py +2 -2
- hud/agents/tests/test_openai.py +3 -1
- hud/cli/__init__.py +114 -52
- hud/cli/build.py +121 -71
- hud/cli/debug.py +2 -2
- hud/cli/{mcp_server.py → dev.py} +101 -38
- hud/cli/eval.py +175 -90
- hud/cli/init.py +442 -64
- hud/cli/list_func.py +72 -71
- hud/cli/pull.py +1 -2
- hud/cli/push.py +35 -23
- hud/cli/remove.py +35 -41
- hud/cli/tests/test_analyze.py +2 -1
- hud/cli/tests/test_analyze_metadata.py +42 -49
- hud/cli/tests/test_build.py +28 -52
- hud/cli/tests/test_cursor.py +1 -1
- hud/cli/tests/test_debug.py +1 -1
- hud/cli/tests/test_list_func.py +75 -64
- hud/cli/tests/test_main_module.py +30 -0
- hud/cli/tests/test_mcp_server.py +3 -3
- hud/cli/tests/test_pull.py +30 -61
- hud/cli/tests/test_push.py +70 -89
- hud/cli/tests/test_registry.py +36 -38
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/utils/__init__.py +1 -0
- hud/cli/{docker_utils.py → utils/docker.py} +36 -0
- hud/cli/{env_utils.py → utils/environment.py} +7 -7
- hud/cli/{interactive.py → utils/interactive.py} +91 -19
- hud/cli/{analyze_metadata.py → utils/metadata.py} +12 -8
- hud/cli/{registry.py → utils/registry.py} +28 -30
- hud/cli/{remote_runner.py → utils/remote_runner.py} +1 -1
- hud/cli/utils/runner.py +134 -0
- hud/cli/utils/server.py +250 -0
- hud/clients/base.py +1 -1
- hud/clients/fastmcp.py +5 -13
- hud/clients/mcp_use.py +6 -10
- hud/server/server.py +35 -5
- hud/shared/exceptions.py +11 -0
- hud/shared/tests/test_exceptions.py +22 -0
- hud/telemetry/tests/__init__.py +0 -0
- hud/telemetry/tests/test_replay.py +40 -0
- hud/telemetry/tests/test_trace.py +63 -0
- hud/tools/base.py +20 -3
- hud/tools/computer/hud.py +15 -6
- hud/tools/executors/tests/test_base_executor.py +27 -0
- hud/tools/response.py +12 -8
- hud/tools/tests/test_response.py +60 -0
- hud/tools/tests/test_tools_init.py +49 -0
- hud/utils/design.py +19 -8
- hud/utils/mcp.py +17 -5
- hud/utils/tests/test_mcp.py +112 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/METADATA +16 -13
- {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/RECORD +62 -52
- hud/cli/runner.py +0 -160
- /hud/cli/{cursor.py → utils/cursor.py} +0 -0
- /hud/cli/{utils.py → utils/logging.py} +0 -0
- {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/WHEEL +0 -0
- {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/licenses/LICENSE +0 -0
hud/cli/eval.py
CHANGED
|
@@ -6,18 +6,13 @@ import asyncio
|
|
|
6
6
|
import json
|
|
7
7
|
import logging
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import
|
|
9
|
+
from typing import Any, Literal
|
|
10
10
|
|
|
11
11
|
import typer
|
|
12
12
|
|
|
13
13
|
import hud
|
|
14
14
|
from hud.utils.design import HUDDesign
|
|
15
15
|
|
|
16
|
-
if TYPE_CHECKING:
|
|
17
|
-
from datasets import Dataset
|
|
18
|
-
from hud.agents import ClaudeAgent, OperatorAgent
|
|
19
|
-
from hud.agents.misc.response_agent import ResponseAgent
|
|
20
|
-
|
|
21
16
|
logger = logging.getLogger(__name__)
|
|
22
17
|
design = HUDDesign()
|
|
23
18
|
|
|
@@ -29,17 +24,8 @@ def build_agent(
|
|
|
29
24
|
allowed_tools: list[str] | None = None,
|
|
30
25
|
) -> Any:
|
|
31
26
|
"""Create and return the requested agent type."""
|
|
32
|
-
|
|
27
|
+
|
|
33
28
|
# Import agents lazily to avoid dependency issues
|
|
34
|
-
try:
|
|
35
|
-
from hud.agents.misc.response_agent import ResponseAgent
|
|
36
|
-
except ImportError as e:
|
|
37
|
-
design.error(
|
|
38
|
-
"Agent dependencies are not installed. "
|
|
39
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
40
|
-
)
|
|
41
|
-
raise typer.Exit(1) from e
|
|
42
|
-
|
|
43
29
|
if agent_type == "openai":
|
|
44
30
|
try:
|
|
45
31
|
from hud.agents import OperatorAgent
|
|
@@ -49,14 +35,14 @@ def build_agent(
|
|
|
49
35
|
"Please install with: pip install 'hud-python[agent]'"
|
|
50
36
|
)
|
|
51
37
|
raise typer.Exit(1) from e
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
38
|
+
|
|
39
|
+
if allowed_tools:
|
|
40
|
+
return OperatorAgent(
|
|
41
|
+
allowed_tools=allowed_tools,
|
|
42
|
+
)
|
|
43
|
+
else:
|
|
44
|
+
return OperatorAgent()
|
|
45
|
+
|
|
60
46
|
# Fallback Claude agent (Anthropic)
|
|
61
47
|
try:
|
|
62
48
|
from hud.agents import ClaudeAgent
|
|
@@ -66,15 +52,18 @@ def build_agent(
|
|
|
66
52
|
"Please install with: pip install 'hud-python[agent]'"
|
|
67
53
|
)
|
|
68
54
|
raise typer.Exit(1) from e
|
|
69
|
-
|
|
55
|
+
|
|
70
56
|
model = model or "claude-sonnet-4-20250514"
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
57
|
+
|
|
58
|
+
if allowed_tools:
|
|
59
|
+
return ClaudeAgent(
|
|
60
|
+
model=model,
|
|
61
|
+
allowed_tools=allowed_tools,
|
|
62
|
+
)
|
|
63
|
+
else:
|
|
64
|
+
return ClaudeAgent(
|
|
65
|
+
model=model,
|
|
66
|
+
)
|
|
78
67
|
|
|
79
68
|
|
|
80
69
|
async def run_single_task(
|
|
@@ -85,26 +74,91 @@ async def run_single_task(
|
|
|
85
74
|
allowed_tools: list[str] | None = None,
|
|
86
75
|
max_steps: int = 10,
|
|
87
76
|
) -> None:
|
|
88
|
-
"""Load one task and execute it."""
|
|
89
|
-
|
|
77
|
+
"""Load one task and execute it, or detect if JSON contains a list and run as dataset."""
|
|
78
|
+
|
|
90
79
|
design.info("📊 Loading dataset…")
|
|
91
|
-
|
|
92
|
-
# Import Task lazily
|
|
80
|
+
|
|
81
|
+
# Import Task and run_dataset lazily
|
|
93
82
|
try:
|
|
94
|
-
from hud.datasets import Task
|
|
83
|
+
from hud.datasets import Task, run_dataset
|
|
95
84
|
except ImportError as e:
|
|
96
85
|
design.error(
|
|
97
86
|
"Dataset dependencies are not installed. "
|
|
98
87
|
"Please install with: pip install 'hud-python[agent]'"
|
|
99
88
|
)
|
|
100
89
|
raise typer.Exit(1) from e
|
|
101
|
-
|
|
102
|
-
# Check if it's a
|
|
90
|
+
|
|
91
|
+
# Check if it's a JSON file
|
|
103
92
|
path = Path(source)
|
|
104
93
|
if path.exists() and path.suffix == ".json":
|
|
105
|
-
with open(path
|
|
106
|
-
|
|
107
|
-
|
|
94
|
+
with open(path) as f: # noqa: ASYNC230
|
|
95
|
+
json_data = json.load(f)
|
|
96
|
+
|
|
97
|
+
# Check if JSON contains multiple tasks (list with more than 1 task)
|
|
98
|
+
if isinstance(json_data, list) and len(json_data) > 1:
|
|
99
|
+
design.info(f"Found {len(json_data)} tasks in JSON file, running as dataset…")
|
|
100
|
+
|
|
101
|
+
# Build agent class and config for run_dataset
|
|
102
|
+
if agent_type == "openai":
|
|
103
|
+
try:
|
|
104
|
+
from hud.agents import OperatorAgent
|
|
105
|
+
|
|
106
|
+
agent_class = OperatorAgent
|
|
107
|
+
except ImportError as e:
|
|
108
|
+
design.error(
|
|
109
|
+
"OpenAI agent dependencies are not installed. "
|
|
110
|
+
"Please install with: pip install 'hud-python[agent]'"
|
|
111
|
+
)
|
|
112
|
+
raise typer.Exit(1) from e
|
|
113
|
+
|
|
114
|
+
agent_config: dict[str, Any] = {
|
|
115
|
+
}
|
|
116
|
+
if allowed_tools:
|
|
117
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
118
|
+
|
|
119
|
+
else:
|
|
120
|
+
try:
|
|
121
|
+
from hud.agents import ClaudeAgent
|
|
122
|
+
|
|
123
|
+
agent_class = ClaudeAgent
|
|
124
|
+
except ImportError as e:
|
|
125
|
+
design.error(
|
|
126
|
+
"Claude agent dependencies are not installed. "
|
|
127
|
+
"Please install with: pip install 'hud-python[agent]'"
|
|
128
|
+
)
|
|
129
|
+
raise typer.Exit(1) from e
|
|
130
|
+
|
|
131
|
+
agent_config = {
|
|
132
|
+
"model": model or "claude-sonnet-4-20250514",
|
|
133
|
+
}
|
|
134
|
+
if allowed_tools:
|
|
135
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
136
|
+
|
|
137
|
+
# Run as dataset with single-task concurrency to maintain debug behavior
|
|
138
|
+
results = await run_dataset(
|
|
139
|
+
name=f"JSON Dataset: {path.name}",
|
|
140
|
+
dataset=json_data, # Pass the list directly
|
|
141
|
+
agent_class=agent_class,
|
|
142
|
+
agent_config=agent_config,
|
|
143
|
+
max_concurrent=1, # Run sequentially for debug mode
|
|
144
|
+
metadata={"source": str(path)},
|
|
145
|
+
max_steps=max_steps,
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
# Display summary
|
|
149
|
+
successful = sum(1 for r in results if getattr(r, "reward", 0) > 0)
|
|
150
|
+
design.success(f"Completed {len(results)} tasks: {successful} successful")
|
|
151
|
+
return
|
|
152
|
+
|
|
153
|
+
# Single task JSON (either direct object or list with 1 task)
|
|
154
|
+
if isinstance(json_data, list) and len(json_data) == 1:
|
|
155
|
+
design.info("Found 1 task in JSON file, running as single task…")
|
|
156
|
+
task = Task(**json_data[0])
|
|
157
|
+
elif isinstance(json_data, dict):
|
|
158
|
+
task = Task(**json_data)
|
|
159
|
+
else:
|
|
160
|
+
design.error("JSON file must contain a list of tasks when using --full flag")
|
|
161
|
+
raise typer.Exit(1)
|
|
108
162
|
else:
|
|
109
163
|
# Load from HuggingFace dataset
|
|
110
164
|
try:
|
|
@@ -115,15 +169,15 @@ async def run_single_task(
|
|
|
115
169
|
"Please install with: pip install 'hud-python[agent]'"
|
|
116
170
|
)
|
|
117
171
|
raise typer.Exit(1) from e
|
|
118
|
-
|
|
172
|
+
|
|
119
173
|
dataset = load_dataset(source, split="train")
|
|
120
|
-
|
|
174
|
+
|
|
121
175
|
# Get first task from dataset
|
|
122
176
|
sample_task = dataset[0] # type: ignore[index]
|
|
123
177
|
task = Task(**sample_task) # type: ignore[arg-type]
|
|
124
|
-
|
|
178
|
+
|
|
125
179
|
task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt
|
|
126
|
-
|
|
180
|
+
|
|
127
181
|
with hud.trace(name=task_prompt):
|
|
128
182
|
agent = build_agent(
|
|
129
183
|
agent_type,
|
|
@@ -145,7 +199,7 @@ async def run_full_dataset(
|
|
|
145
199
|
max_steps: int = 50,
|
|
146
200
|
) -> list[Any]:
|
|
147
201
|
"""Run evaluation across the entire dataset using hud.datasets.run_dataset."""
|
|
148
|
-
|
|
202
|
+
|
|
149
203
|
# Import run_dataset lazily
|
|
150
204
|
try:
|
|
151
205
|
from hud.datasets import run_dataset
|
|
@@ -155,11 +209,29 @@ async def run_full_dataset(
|
|
|
155
209
|
"Please install with: pip install 'hud-python[agent]'"
|
|
156
210
|
)
|
|
157
211
|
raise typer.Exit(1) from e
|
|
158
|
-
|
|
212
|
+
|
|
213
|
+
# Check if source is a JSON file with list of tasks
|
|
214
|
+
path = Path(source)
|
|
215
|
+
dataset_or_tasks = source
|
|
216
|
+
dataset_name = source.split("/")[-1]
|
|
217
|
+
|
|
218
|
+
if path.exists() and path.suffix == ".json":
|
|
219
|
+
with open(path) as f: # noqa: ASYNC230
|
|
220
|
+
json_data = json.load(f)
|
|
221
|
+
|
|
222
|
+
if isinstance(json_data, list):
|
|
223
|
+
dataset_or_tasks = json_data
|
|
224
|
+
dataset_name = f"JSON Dataset: {path.name}"
|
|
225
|
+
design.info(f"Found {len(json_data)} tasks in JSON file")
|
|
226
|
+
else:
|
|
227
|
+
design.error("JSON file must contain a list of tasks when using --full flag")
|
|
228
|
+
raise typer.Exit(1)
|
|
229
|
+
|
|
159
230
|
# Build agent class + config for run_dataset
|
|
160
231
|
if agent_type == "openai":
|
|
161
232
|
try:
|
|
162
233
|
from hud.agents import OperatorAgent
|
|
234
|
+
|
|
163
235
|
agent_class = OperatorAgent
|
|
164
236
|
except ImportError as e:
|
|
165
237
|
design.error(
|
|
@@ -167,13 +239,16 @@ async def run_full_dataset(
|
|
|
167
239
|
"Please install with: pip install 'hud-python[agent]'"
|
|
168
240
|
)
|
|
169
241
|
raise typer.Exit(1) from e
|
|
170
|
-
|
|
242
|
+
|
|
171
243
|
agent_config: dict[str, Any] = {
|
|
172
|
-
"allowed_tools": allowed_tools or ["openai_computer"],
|
|
173
244
|
}
|
|
245
|
+
if allowed_tools:
|
|
246
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
247
|
+
|
|
174
248
|
else:
|
|
175
249
|
try:
|
|
176
250
|
from hud.agents import ClaudeAgent
|
|
251
|
+
|
|
177
252
|
agent_class = ClaudeAgent
|
|
178
253
|
except ImportError as e:
|
|
179
254
|
design.error(
|
|
@@ -181,29 +256,29 @@ async def run_full_dataset(
|
|
|
181
256
|
"Please install with: pip install 'hud-python[agent]'"
|
|
182
257
|
)
|
|
183
258
|
raise typer.Exit(1) from e
|
|
184
|
-
|
|
259
|
+
|
|
185
260
|
agent_config = {
|
|
186
261
|
"model": model or "claude-sonnet-4-20250514",
|
|
187
|
-
"allowed_tools": allowed_tools or ["anthropic_computer"],
|
|
188
262
|
}
|
|
189
|
-
|
|
263
|
+
if allowed_tools:
|
|
264
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
265
|
+
|
|
190
266
|
design.info("🚀 Running evaluation…")
|
|
191
267
|
return await run_dataset(
|
|
192
|
-
name=f"Evaluation {
|
|
193
|
-
dataset=
|
|
268
|
+
name=f"Evaluation {dataset_name}",
|
|
269
|
+
dataset=dataset_or_tasks,
|
|
194
270
|
agent_class=agent_class,
|
|
195
271
|
agent_config=agent_config,
|
|
196
272
|
max_concurrent=max_concurrent,
|
|
197
273
|
metadata={"dataset": source},
|
|
198
274
|
max_steps=max_steps,
|
|
199
|
-
auto_respond=True,
|
|
200
275
|
)
|
|
201
276
|
|
|
202
277
|
|
|
203
278
|
def eval_command(
|
|
204
279
|
source: str = typer.Argument(
|
|
205
280
|
...,
|
|
206
|
-
help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50')
|
|
281
|
+
help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50'), single task JSON file, or JSON file with list of tasks", # noqa: E501
|
|
207
282
|
),
|
|
208
283
|
full: bool = typer.Option(
|
|
209
284
|
False,
|
|
@@ -237,66 +312,76 @@ def eval_command(
|
|
|
237
312
|
),
|
|
238
313
|
) -> None:
|
|
239
314
|
"""🚀 Run evaluation on datasets or individual tasks with agents.
|
|
240
|
-
|
|
315
|
+
|
|
241
316
|
Examples:
|
|
242
317
|
# Evaluate a single task from SheetBench
|
|
243
318
|
hud eval hud-evals/SheetBench-50
|
|
244
|
-
|
|
319
|
+
|
|
245
320
|
# Evaluate the FULL SheetBench dataset with Claude
|
|
246
321
|
hud eval hud-evals/SheetBench-50 --full --agent claude
|
|
247
|
-
|
|
322
|
+
|
|
248
323
|
# Run a single task from a JSON file
|
|
249
324
|
hud eval task.json
|
|
250
|
-
|
|
325
|
+
|
|
326
|
+
# Run multiple tasks from a JSON file (auto-detects list)
|
|
327
|
+
hud eval tasks.json # If tasks.json contains a list, runs all tasks
|
|
328
|
+
|
|
329
|
+
# Run JSON list with full dataset mode and concurrency
|
|
330
|
+
hud eval tasks.json --full --max-concurrent 10
|
|
331
|
+
|
|
251
332
|
# Run with OpenAI Operator agent
|
|
252
333
|
hud eval hud-evals/OSWorld-Gold-Beta --agent openai
|
|
253
334
|
"""
|
|
254
|
-
from hud.settings import settings
|
|
255
335
|
import os
|
|
256
|
-
|
|
336
|
+
|
|
337
|
+
from hud.settings import settings
|
|
338
|
+
|
|
257
339
|
# Check for required API keys
|
|
258
340
|
if agent == "claude":
|
|
259
341
|
if not settings.anthropic_api_key or not os.environ.get("ANTHROPIC_API_KEY"):
|
|
260
342
|
design.error("ANTHROPIC_API_KEY is required for Claude agent")
|
|
261
343
|
design.info("Set it in your environment or .env file: ANTHROPIC_API_KEY=your-key-here")
|
|
262
344
|
raise typer.Exit(1)
|
|
263
|
-
elif agent == "openai"
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
345
|
+
elif agent == "openai" and (
|
|
346
|
+
not settings.openai_api_key or not os.environ.get("OPENAI_API_KEY")
|
|
347
|
+
):
|
|
348
|
+
design.error("OPENAI_API_KEY is required for OpenAI agent")
|
|
349
|
+
design.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
|
|
350
|
+
raise typer.Exit(1)
|
|
351
|
+
|
|
269
352
|
# Check for HUD_API_KEY if using HUD services
|
|
270
353
|
if not settings.api_key or not os.environ.get("HUD_API_KEY"):
|
|
271
354
|
design.warning("HUD_API_KEY not set. Some features may be limited.")
|
|
272
355
|
design.info("Get your API key at: https://app.hud.so")
|
|
273
|
-
|
|
356
|
+
|
|
274
357
|
# Parse allowed tools
|
|
275
358
|
allowed_tools_list = (
|
|
276
|
-
[t.strip() for t in allowed_tools.split(",") if t.strip()]
|
|
277
|
-
if allowed_tools
|
|
278
|
-
else None
|
|
359
|
+
[t.strip() for t in allowed_tools.split(",") if t.strip()] if allowed_tools else None
|
|
279
360
|
)
|
|
280
|
-
|
|
361
|
+
|
|
281
362
|
# Set default max_steps if not provided
|
|
282
363
|
if max_steps is None:
|
|
283
364
|
max_steps = 50 if full else 10
|
|
284
|
-
|
|
365
|
+
|
|
285
366
|
# Run evaluation
|
|
286
367
|
if full:
|
|
287
|
-
asyncio.run(
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
368
|
+
asyncio.run(
|
|
369
|
+
run_full_dataset(
|
|
370
|
+
source,
|
|
371
|
+
agent_type=agent,
|
|
372
|
+
model=model,
|
|
373
|
+
allowed_tools=allowed_tools_list,
|
|
374
|
+
max_concurrent=max_concurrent,
|
|
375
|
+
max_steps=max_steps,
|
|
376
|
+
)
|
|
377
|
+
)
|
|
295
378
|
else:
|
|
296
|
-
asyncio.run(
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
379
|
+
asyncio.run(
|
|
380
|
+
run_single_task(
|
|
381
|
+
source,
|
|
382
|
+
agent_type=agent,
|
|
383
|
+
model=model,
|
|
384
|
+
allowed_tools=allowed_tools_list,
|
|
385
|
+
max_steps=max_steps,
|
|
386
|
+
)
|
|
387
|
+
)
|