hud-python 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python has been flagged as potentially problematic; further details were linked from the original registry page.

Files changed (63)
  1. hud/__main__.py +8 -0
  2. hud/agents/base.py +7 -8
  3. hud/agents/langchain.py +2 -2
  4. hud/agents/tests/test_openai.py +3 -1
  5. hud/cli/__init__.py +114 -52
  6. hud/cli/build.py +121 -71
  7. hud/cli/debug.py +2 -2
  8. hud/cli/{mcp_server.py → dev.py} +101 -38
  9. hud/cli/eval.py +175 -90
  10. hud/cli/init.py +442 -64
  11. hud/cli/list_func.py +72 -71
  12. hud/cli/pull.py +1 -2
  13. hud/cli/push.py +35 -23
  14. hud/cli/remove.py +35 -41
  15. hud/cli/tests/test_analyze.py +2 -1
  16. hud/cli/tests/test_analyze_metadata.py +42 -49
  17. hud/cli/tests/test_build.py +28 -52
  18. hud/cli/tests/test_cursor.py +1 -1
  19. hud/cli/tests/test_debug.py +1 -1
  20. hud/cli/tests/test_list_func.py +75 -64
  21. hud/cli/tests/test_main_module.py +30 -0
  22. hud/cli/tests/test_mcp_server.py +3 -3
  23. hud/cli/tests/test_pull.py +30 -61
  24. hud/cli/tests/test_push.py +70 -89
  25. hud/cli/tests/test_registry.py +36 -38
  26. hud/cli/tests/test_utils.py +1 -1
  27. hud/cli/utils/__init__.py +1 -0
  28. hud/cli/{docker_utils.py → utils/docker.py} +36 -0
  29. hud/cli/{env_utils.py → utils/environment.py} +7 -7
  30. hud/cli/{interactive.py → utils/interactive.py} +91 -19
  31. hud/cli/{analyze_metadata.py → utils/metadata.py} +12 -8
  32. hud/cli/{registry.py → utils/registry.py} +28 -30
  33. hud/cli/{remote_runner.py → utils/remote_runner.py} +1 -1
  34. hud/cli/utils/runner.py +134 -0
  35. hud/cli/utils/server.py +250 -0
  36. hud/clients/base.py +1 -1
  37. hud/clients/fastmcp.py +5 -13
  38. hud/clients/mcp_use.py +6 -10
  39. hud/server/server.py +35 -5
  40. hud/shared/exceptions.py +11 -0
  41. hud/shared/tests/test_exceptions.py +22 -0
  42. hud/telemetry/tests/__init__.py +0 -0
  43. hud/telemetry/tests/test_replay.py +40 -0
  44. hud/telemetry/tests/test_trace.py +63 -0
  45. hud/tools/base.py +20 -3
  46. hud/tools/computer/hud.py +15 -6
  47. hud/tools/executors/tests/test_base_executor.py +27 -0
  48. hud/tools/response.py +12 -8
  49. hud/tools/tests/test_response.py +60 -0
  50. hud/tools/tests/test_tools_init.py +49 -0
  51. hud/utils/design.py +19 -8
  52. hud/utils/mcp.py +17 -5
  53. hud/utils/tests/test_mcp.py +112 -0
  54. hud/utils/tests/test_version.py +1 -1
  55. hud/version.py +1 -1
  56. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/METADATA +16 -13
  57. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/RECORD +62 -52
  58. hud/cli/runner.py +0 -160
  59. hud/cli/{cursor.py → utils/cursor.py} +0 -0
  60. hud/cli/{utils.py → utils/logging.py} +0 -0
  61. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/WHEEL +0 -0
  62. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/entry_points.txt +0 -0
  63. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/licenses/LICENSE +0 -0
hud/cli/eval.py CHANGED
@@ -6,18 +6,13 @@ import asyncio
6
6
  import json
7
7
  import logging
8
8
  from pathlib import Path
9
- from typing import TYPE_CHECKING, Any, Literal
9
+ from typing import Any, Literal
10
10
 
11
11
  import typer
12
12
 
13
13
  import hud
14
14
  from hud.utils.design import HUDDesign
15
15
 
16
- if TYPE_CHECKING:
17
- from datasets import Dataset
18
- from hud.agents import ClaudeAgent, OperatorAgent
19
- from hud.agents.misc.response_agent import ResponseAgent
20
-
21
16
  logger = logging.getLogger(__name__)
22
17
  design = HUDDesign()
23
18
 
@@ -29,17 +24,8 @@ def build_agent(
29
24
  allowed_tools: list[str] | None = None,
30
25
  ) -> Any:
31
26
  """Create and return the requested agent type."""
32
-
27
+
33
28
  # Import agents lazily to avoid dependency issues
34
- try:
35
- from hud.agents.misc.response_agent import ResponseAgent
36
- except ImportError as e:
37
- design.error(
38
- "Agent dependencies are not installed. "
39
- "Please install with: pip install 'hud-python[agent]'"
40
- )
41
- raise typer.Exit(1) from e
42
-
43
29
  if agent_type == "openai":
44
30
  try:
45
31
  from hud.agents import OperatorAgent
@@ -49,14 +35,14 @@ def build_agent(
49
35
  "Please install with: pip install 'hud-python[agent]'"
50
36
  )
51
37
  raise typer.Exit(1) from e
52
-
53
- allowed_tools = allowed_tools or ["openai_computer"]
54
-
55
- return OperatorAgent(
56
- allowed_tools=allowed_tools,
57
- response_agent=ResponseAgent(),
58
- )
59
-
38
+
39
+ if allowed_tools:
40
+ return OperatorAgent(
41
+ allowed_tools=allowed_tools,
42
+ )
43
+ else:
44
+ return OperatorAgent()
45
+
60
46
  # Fallback Claude agent (Anthropic)
61
47
  try:
62
48
  from hud.agents import ClaudeAgent
@@ -66,15 +52,18 @@ def build_agent(
66
52
  "Please install with: pip install 'hud-python[agent]'"
67
53
  )
68
54
  raise typer.Exit(1) from e
69
-
55
+
70
56
  model = model or "claude-sonnet-4-20250514"
71
- allowed_tools = allowed_tools or ["anthropic_computer"]
72
-
73
- return ClaudeAgent(
74
- model=model,
75
- allowed_tools=allowed_tools,
76
- response_agent=ResponseAgent(),
77
- )
57
+
58
+ if allowed_tools:
59
+ return ClaudeAgent(
60
+ model=model,
61
+ allowed_tools=allowed_tools,
62
+ )
63
+ else:
64
+ return ClaudeAgent(
65
+ model=model,
66
+ )
78
67
 
79
68
 
80
69
  async def run_single_task(
@@ -85,26 +74,91 @@ async def run_single_task(
85
74
  allowed_tools: list[str] | None = None,
86
75
  max_steps: int = 10,
87
76
  ) -> None:
88
- """Load one task and execute it."""
89
-
77
+ """Load one task and execute it, or detect if JSON contains a list and run as dataset."""
78
+
90
79
  design.info("📊 Loading dataset…")
91
-
92
- # Import Task lazily
80
+
81
+ # Import Task and run_dataset lazily
93
82
  try:
94
- from hud.datasets import Task
83
+ from hud.datasets import Task, run_dataset
95
84
  except ImportError as e:
96
85
  design.error(
97
86
  "Dataset dependencies are not installed. "
98
87
  "Please install with: pip install 'hud-python[agent]'"
99
88
  )
100
89
  raise typer.Exit(1) from e
101
-
102
- # Check if it's a single task JSON file
90
+
91
+ # Check if it's a JSON file
103
92
  path = Path(source)
104
93
  if path.exists() and path.suffix == ".json":
105
- with open(path, "r") as f:
106
- task_data = json.load(f)
107
- task = Task(**task_data)
94
+ with open(path) as f: # noqa: ASYNC230
95
+ json_data = json.load(f)
96
+
97
+ # Check if JSON contains multiple tasks (list with more than 1 task)
98
+ if isinstance(json_data, list) and len(json_data) > 1:
99
+ design.info(f"Found {len(json_data)} tasks in JSON file, running as dataset…")
100
+
101
+ # Build agent class and config for run_dataset
102
+ if agent_type == "openai":
103
+ try:
104
+ from hud.agents import OperatorAgent
105
+
106
+ agent_class = OperatorAgent
107
+ except ImportError as e:
108
+ design.error(
109
+ "OpenAI agent dependencies are not installed. "
110
+ "Please install with: pip install 'hud-python[agent]'"
111
+ )
112
+ raise typer.Exit(1) from e
113
+
114
+ agent_config: dict[str, Any] = {
115
+ }
116
+ if allowed_tools:
117
+ agent_config["allowed_tools"] = allowed_tools
118
+
119
+ else:
120
+ try:
121
+ from hud.agents import ClaudeAgent
122
+
123
+ agent_class = ClaudeAgent
124
+ except ImportError as e:
125
+ design.error(
126
+ "Claude agent dependencies are not installed. "
127
+ "Please install with: pip install 'hud-python[agent]'"
128
+ )
129
+ raise typer.Exit(1) from e
130
+
131
+ agent_config = {
132
+ "model": model or "claude-sonnet-4-20250514",
133
+ }
134
+ if allowed_tools:
135
+ agent_config["allowed_tools"] = allowed_tools
136
+
137
+ # Run as dataset with single-task concurrency to maintain debug behavior
138
+ results = await run_dataset(
139
+ name=f"JSON Dataset: {path.name}",
140
+ dataset=json_data, # Pass the list directly
141
+ agent_class=agent_class,
142
+ agent_config=agent_config,
143
+ max_concurrent=1, # Run sequentially for debug mode
144
+ metadata={"source": str(path)},
145
+ max_steps=max_steps,
146
+ )
147
+
148
+ # Display summary
149
+ successful = sum(1 for r in results if getattr(r, "reward", 0) > 0)
150
+ design.success(f"Completed {len(results)} tasks: {successful} successful")
151
+ return
152
+
153
+ # Single task JSON (either direct object or list with 1 task)
154
+ if isinstance(json_data, list) and len(json_data) == 1:
155
+ design.info("Found 1 task in JSON file, running as single task…")
156
+ task = Task(**json_data[0])
157
+ elif isinstance(json_data, dict):
158
+ task = Task(**json_data)
159
+ else:
160
+ design.error("JSON file must contain a list of tasks when using --full flag")
161
+ raise typer.Exit(1)
108
162
  else:
109
163
  # Load from HuggingFace dataset
110
164
  try:
@@ -115,15 +169,15 @@ async def run_single_task(
115
169
  "Please install with: pip install 'hud-python[agent]'"
116
170
  )
117
171
  raise typer.Exit(1) from e
118
-
172
+
119
173
  dataset = load_dataset(source, split="train")
120
-
174
+
121
175
  # Get first task from dataset
122
176
  sample_task = dataset[0] # type: ignore[index]
123
177
  task = Task(**sample_task) # type: ignore[arg-type]
124
-
178
+
125
179
  task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt
126
-
180
+
127
181
  with hud.trace(name=task_prompt):
128
182
  agent = build_agent(
129
183
  agent_type,
@@ -145,7 +199,7 @@ async def run_full_dataset(
145
199
  max_steps: int = 50,
146
200
  ) -> list[Any]:
147
201
  """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
148
-
202
+
149
203
  # Import run_dataset lazily
150
204
  try:
151
205
  from hud.datasets import run_dataset
@@ -155,11 +209,29 @@ async def run_full_dataset(
155
209
  "Please install with: pip install 'hud-python[agent]'"
156
210
  )
157
211
  raise typer.Exit(1) from e
158
-
212
+
213
+ # Check if source is a JSON file with list of tasks
214
+ path = Path(source)
215
+ dataset_or_tasks = source
216
+ dataset_name = source.split("/")[-1]
217
+
218
+ if path.exists() and path.suffix == ".json":
219
+ with open(path) as f: # noqa: ASYNC230
220
+ json_data = json.load(f)
221
+
222
+ if isinstance(json_data, list):
223
+ dataset_or_tasks = json_data
224
+ dataset_name = f"JSON Dataset: {path.name}"
225
+ design.info(f"Found {len(json_data)} tasks in JSON file")
226
+ else:
227
+ design.error("JSON file must contain a list of tasks when using --full flag")
228
+ raise typer.Exit(1)
229
+
159
230
  # Build agent class + config for run_dataset
160
231
  if agent_type == "openai":
161
232
  try:
162
233
  from hud.agents import OperatorAgent
234
+
163
235
  agent_class = OperatorAgent
164
236
  except ImportError as e:
165
237
  design.error(
@@ -167,13 +239,16 @@ async def run_full_dataset(
167
239
  "Please install with: pip install 'hud-python[agent]'"
168
240
  )
169
241
  raise typer.Exit(1) from e
170
-
242
+
171
243
  agent_config: dict[str, Any] = {
172
- "allowed_tools": allowed_tools or ["openai_computer"],
173
244
  }
245
+ if allowed_tools:
246
+ agent_config["allowed_tools"] = allowed_tools
247
+
174
248
  else:
175
249
  try:
176
250
  from hud.agents import ClaudeAgent
251
+
177
252
  agent_class = ClaudeAgent
178
253
  except ImportError as e:
179
254
  design.error(
@@ -181,29 +256,29 @@ async def run_full_dataset(
181
256
  "Please install with: pip install 'hud-python[agent]'"
182
257
  )
183
258
  raise typer.Exit(1) from e
184
-
259
+
185
260
  agent_config = {
186
261
  "model": model or "claude-sonnet-4-20250514",
187
- "allowed_tools": allowed_tools or ["anthropic_computer"],
188
262
  }
189
-
263
+ if allowed_tools:
264
+ agent_config["allowed_tools"] = allowed_tools
265
+
190
266
  design.info("🚀 Running evaluation…")
191
267
  return await run_dataset(
192
- name=f"Evaluation {source.split('/')[-1]}",
193
- dataset=source,
268
+ name=f"Evaluation {dataset_name}",
269
+ dataset=dataset_or_tasks,
194
270
  agent_class=agent_class,
195
271
  agent_config=agent_config,
196
272
  max_concurrent=max_concurrent,
197
273
  metadata={"dataset": source},
198
274
  max_steps=max_steps,
199
- auto_respond=True,
200
275
  )
201
276
 
202
277
 
203
278
  def eval_command(
204
279
  source: str = typer.Argument(
205
280
  ...,
206
- help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
281
+ help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50'), single task JSON file, or JSON file with list of tasks", # noqa: E501
207
282
  ),
208
283
  full: bool = typer.Option(
209
284
  False,
@@ -237,66 +312,76 @@ def eval_command(
237
312
  ),
238
313
  ) -> None:
239
314
  """🚀 Run evaluation on datasets or individual tasks with agents.
240
-
315
+
241
316
  Examples:
242
317
  # Evaluate a single task from SheetBench
243
318
  hud eval hud-evals/SheetBench-50
244
-
319
+
245
320
  # Evaluate the FULL SheetBench dataset with Claude
246
321
  hud eval hud-evals/SheetBench-50 --full --agent claude
247
-
322
+
248
323
  # Run a single task from a JSON file
249
324
  hud eval task.json
250
-
325
+
326
+ # Run multiple tasks from a JSON file (auto-detects list)
327
+ hud eval tasks.json # If tasks.json contains a list, runs all tasks
328
+
329
+ # Run JSON list with full dataset mode and concurrency
330
+ hud eval tasks.json --full --max-concurrent 10
331
+
251
332
  # Run with OpenAI Operator agent
252
333
  hud eval hud-evals/OSWorld-Gold-Beta --agent openai
253
334
  """
254
- from hud.settings import settings
255
335
  import os
256
-
336
+
337
+ from hud.settings import settings
338
+
257
339
  # Check for required API keys
258
340
  if agent == "claude":
259
341
  if not settings.anthropic_api_key or not os.environ.get("ANTHROPIC_API_KEY"):
260
342
  design.error("ANTHROPIC_API_KEY is required for Claude agent")
261
343
  design.info("Set it in your environment or .env file: ANTHROPIC_API_KEY=your-key-here")
262
344
  raise typer.Exit(1)
263
- elif agent == "openai":
264
- if not settings.openai_api_key or not os.environ.get("OPENAI_API_KEY"):
265
- design.error("OPENAI_API_KEY is required for OpenAI agent")
266
- design.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
267
- raise typer.Exit(1)
268
-
345
+ elif agent == "openai" and (
346
+ not settings.openai_api_key or not os.environ.get("OPENAI_API_KEY")
347
+ ):
348
+ design.error("OPENAI_API_KEY is required for OpenAI agent")
349
+ design.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
350
+ raise typer.Exit(1)
351
+
269
352
  # Check for HUD_API_KEY if using HUD services
270
353
  if not settings.api_key or not os.environ.get("HUD_API_KEY"):
271
354
  design.warning("HUD_API_KEY not set. Some features may be limited.")
272
355
  design.info("Get your API key at: https://app.hud.so")
273
-
356
+
274
357
  # Parse allowed tools
275
358
  allowed_tools_list = (
276
- [t.strip() for t in allowed_tools.split(",") if t.strip()]
277
- if allowed_tools
278
- else None
359
+ [t.strip() for t in allowed_tools.split(",") if t.strip()] if allowed_tools else None
279
360
  )
280
-
361
+
281
362
  # Set default max_steps if not provided
282
363
  if max_steps is None:
283
364
  max_steps = 50 if full else 10
284
-
365
+
285
366
  # Run evaluation
286
367
  if full:
287
- asyncio.run(run_full_dataset(
288
- source,
289
- agent_type=agent,
290
- model=model,
291
- allowed_tools=allowed_tools_list,
292
- max_concurrent=max_concurrent,
293
- max_steps=max_steps,
294
- ))
368
+ asyncio.run(
369
+ run_full_dataset(
370
+ source,
371
+ agent_type=agent,
372
+ model=model,
373
+ allowed_tools=allowed_tools_list,
374
+ max_concurrent=max_concurrent,
375
+ max_steps=max_steps,
376
+ )
377
+ )
295
378
  else:
296
- asyncio.run(run_single_task(
297
- source,
298
- agent_type=agent,
299
- model=model,
300
- allowed_tools=allowed_tools_list,
301
- max_steps=max_steps,
302
- ))
379
+ asyncio.run(
380
+ run_single_task(
381
+ source,
382
+ agent_type=agent,
383
+ model=model,
384
+ allowed_tools=allowed_tools_list,
385
+ max_steps=max_steps,
386
+ )
387
+ )