hud-python 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

hud/agents/base.py CHANGED
@@ -85,6 +85,7 @@ class MCPAgent(ABC):
         self._tool_map: dict[str, types.Tool] = {}  # Simplified: just name to tool
         self.screenshot_history: list[str] = []
         self._auto_trace = auto_trace
+        self._auto_trace_cm: Any | None = None  # Store auto-created trace context manager
         self.initialization_complete = False

         # Response agent to automatically interact with the model
@@ -303,6 +304,9 @@ class MCPAgent(ABC):
                     except Exception as e:
                         logger.warning("ResponseAgent failed: %s", e)
                     if decision == "STOP":
+                        # Try to submit response through lifecycle tool
+                        await self._maybe_submit_response(response, messages)
+
                         logger.info("Stopping execution")
                         final_response = response
                         break
@@ -483,6 +487,40 @@ class MCPAgent(ABC):
             self._available_tools.append(tool)
             # Simplified mapping - just tool name to tool
             self._tool_map[tool.name] = tool
+
+            # Auto-detect response tool as a lifecycle tool
+            if tool.name == "response" and "response" not in self.lifecycle_tools:
+                logger.debug("Auto-detected 'response' tool as a lifecycle tool")
+                self.lifecycle_tools.append("response")
+
+    async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
+        """Submit response through lifecycle tool if available.
+
+        Args:
+            response: The agent's response
+            messages: The current message history (will be modified in-place)
+        """
+        # Check if we have a response lifecycle tool
+        if "response" in self.lifecycle_tools and "response" in self._tool_map:
+            logger.debug("Calling response lifecycle tool")
+            try:
+                # Call the response tool with the agent's response
+                response_tool_call = MCPToolCall(
+                    name="response",
+                    arguments={"response": response.content, "messages": messages}
+                )
+                response_results = await self.call_tools(response_tool_call)
+
+                # Format and add the response tool results to messages
+                response_messages = await self.format_tool_results(
+                    [response_tool_call], response_results
+                )
+                messages.extend(response_messages)
+
+                # Mark the task as done
+                logger.info("Response lifecycle tool executed, marking task as done")
+            except Exception as e:
+                logger.error("Response lifecycle tool failed: %s", e)

     async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
         """Inject metadata into the metadata of the initialize request."""
@@ -491,7 +529,7 @@ class MCPAgent(ABC):
             mcp_config,
             MCPConfigPatch(meta=self.metadata),
         )
-        setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
+        self._auto_trace_cm = setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)

     def get_available_tools(self) -> list[types.Tool]:
         """Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
@@ -532,6 +570,17 @@ class MCPAgent(ABC):

     async def _cleanup(self) -> None:
         """Cleanup resources."""
+        # Clean up auto-created trace if any
+        if self._auto_trace_cm:
+            try:
+                self._auto_trace_cm.__exit__(None, None, None)
+                logger.info("Closed auto-created trace")
+            except Exception as e:
+                logger.warning("Failed to close auto-created trace: %s", e)
+            finally:
+                self._auto_trace_cm = None
+
+        # Clean up auto-created client
         if self._auto_created_client and self.mcp_client:
            try:
                 await self.mcp_client.shutdown()
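
The agent-side change above auto-detects any tool literally named "response" as a lifecycle tool and, on a STOP decision, calls it with the agent's final answer and message history. For context, the following is a minimal sketch of what a matching environment-side tool could look like, assuming a FastMCP-style server from the MCP Python SDK; the server name and return value are illustrative and not part of this release.

    # Hypothetical environment-side counterpart to the "response" lifecycle tool.
    from typing import Any

    from mcp.server.fastmcp import FastMCP

    mcp = FastMCP("example-env")

    @mcp.tool()
    def response(response: str, messages: list[Any] | None = None) -> str:
        """Accept the agent's final answer (and optionally its message history)."""
        # A real environment would store this for later evaluation/reward.
        return f"Received final response ({len(response)} chars)"
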
hud/cli/__init__.py CHANGED
@@ -23,10 +23,13 @@ from .clone import clone_repository, get_clone_message, print_error, print_tutor
 from .cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
 from .debug import debug_mcp_stdio
 from .init import create_environment
+from . import list_func as list_module
 from .mcp_server import run_mcp_dev_server
 from .pull import pull_command
 from .push import push_command
+from .remove import remove_command
 from .utils import CaptureLogger
+from .eval import eval_command

 # Create the main Typer app
 app = typer.Typer(
@@ -442,7 +445,8 @@ def run(

     # Get URL from options or environment
     if not url:
-        url = os.getenv("HUD_MCP_URL", "https://mcp.hud.so/v3/mcp")
+        from hud.settings import settings
+        url = settings.hud_mcp_url

     run_remote_server(image, docker_args, transport, port, url, api_key, run_id, verbose)

@@ -561,6 +565,63 @@ def pull(
     pull_command(target, lock_file, yes, verify_only, verbose)


+@app.command(name="list")
+def list_environments(
+    filter_name: str | None = typer.Option(
+        None, "--filter", "-f", help="Filter environments by name (case-insensitive)"
+    ),
+    json_output: bool = typer.Option(
+        False, "--json", help="Output as JSON"
+    ),
+    show_all: bool = typer.Option(
+        False, "--all", "-a", help="Show all columns including digest"
+    ),
+    verbose: bool = typer.Option(
+        False, "--verbose", "-v", help="Show detailed output"
+    ),
+) -> None:
+    """📋 List all HUD environments in local registry.
+
+    Shows environments pulled with 'hud pull' stored in ~/.hud/envs/
+
+    Examples:
+        hud list                  # List all environments
+        hud list --filter text    # Filter by name
+        hud list --json           # Output as JSON
+        hud list --all            # Show digest column
+        hud list --verbose        # Show full descriptions
+    """
+    list_module.list_command(filter_name, json_output, show_all, verbose)
+
+
+@app.command()
+def remove(
+    target: str | None = typer.Argument(
+        None,
+        help="Environment to remove (digest, name, or 'all' for all environments)"
+    ),
+    yes: bool = typer.Option(
+        False, "--yes", "-y", help="Skip confirmation prompt"
+    ),
+    verbose: bool = typer.Option(
+        False, "--verbose", "-v", help="Show detailed output"
+    ),
+) -> None:
+    """🗑️ Remove HUD environments from local registry.
+
+    Removes environment metadata from ~/.hud/envs/
+    Note: This does not remove the Docker images.
+
+    Examples:
+        hud remove abc123                 # Remove by digest
+        hud remove text_2048              # Remove by name
+        hud remove hudpython/test_init    # Remove by full name
+        hud remove all                    # Remove all environments
+        hud remove all --yes              # Remove all without confirmation
+    """
+    remove_command(target, yes, verbose)
+
+
 @app.command()
 def init(
     name: str = typer.Argument(None, help="Environment name (default: current directory name)"),
@@ -592,6 +653,64 @@ def quickstart() -> None:
     clone("https://github.com/hud-evals/quickstart.git")


+@app.command()
+def eval(
+    source: str = typer.Argument(
+        ...,
+        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
+    ),
+    full: bool = typer.Option(
+        False,
+        "--full",
+        help="Run the entire dataset (omit for single-task debug mode)",
+    ),
+    agent: str = typer.Option(
+        "claude",
+        "--agent",
+        help="Agent backend to use (claude or openai)",
+    ),
+    model: str | None = typer.Option(
+        None,
+        "--model",
+        help="Model name for the chosen agent",
+    ),
+    allowed_tools: str | None = typer.Option(
+        None,
+        "--allowed-tools",
+        help="Comma-separated list of allowed tools",
+    ),
+    max_concurrent: int = typer.Option(
+        30,
+        "--max-concurrent",
+        help="Concurrency level for full-dataset mode",
+    ),
+    max_steps: int = typer.Option(
+        30,
+        "--max-steps",
+        help="Maximum steps per task (default: 10 for single, 50 for full)",
+    ),
+) -> None:
+    """🚀 Run evaluation on datasets or individual tasks with agents."""
+    # Validate agent choice
+    valid_agents = ["claude", "openai"]
+    if agent not in valid_agents:
+        from hud.utils.design import HUDDesign
+        design = HUDDesign()
+        design.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
+        raise typer.Exit(1)
+
+    # Import and run the command
+    eval_command(
+        source=source,
+        full=full,
+        agent=agent,  # type: ignore
+        model=model,
+        allowed_tools=allowed_tools,
+        max_concurrent=max_concurrent,
+        max_steps=max_steps,
+    )
+
+
 def main() -> None:
     """Main entry point for the CLI."""
     # Show header for main help
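
Taken directly from the command docstrings above, the new CLI surface added in this release can be exercised, for example, as:

    hud list --filter text                                    # list cached environments by name
    hud remove text_2048 --yes                                # drop metadata from ~/.hud/envs/
    hud eval hud-evals/SheetBench-50 --full --agent claude    # run a full dataset evaluation
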
@@ -12,6 +12,8 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
 from hud.settings import settings
 from hud.utils.design import HUDDesign

+from .registry import get_registry_dir, list_registry_entries, extract_digest_from_image, load_from_registry
+
 console = Console()
 design = HUDDesign()

@@ -50,38 +52,31 @@ def fetch_lock_from_registry(reference: str) -> dict | None:

 def check_local_cache(reference: str) -> dict | None:
     """Check local cache for lock file."""
-    # Extract digest if present
-    if "@sha256:" in reference:
-        digest = reference.split("@sha256:")[-1][:12]
-    elif "/" in reference:
-        # Try to find by name pattern
-        cache_dir = Path.home() / ".hud" / "envs"
-        if cache_dir.exists():
-            # Look for any cached version of this image
-            for env_dir in cache_dir.iterdir():
-                if env_dir.is_dir():
-                    lock_file = env_dir / "hud.lock.yaml"
-                    if lock_file.exists():
-                        with open(lock_file) as f:
-                            lock_data = yaml.safe_load(f)
-                        # Check if this matches our reference
-                        if lock_data and "image" in lock_data:
-                            image = lock_data["image"]
-                            # Match by name (ignoring tag/digest)
-                            ref_base = reference.split("@")[0].split(":")[0]
-                            img_base = image.split("@")[0].split(":")[0]
-                            if ref_base in img_base or img_base in ref_base:
-                                return lock_data
-        return None
-    else:
-        digest = "latest"
-
-    # Check specific digest directory
-    lock_file = Path.home() / ".hud" / "envs" / digest / "hud.lock.yaml"
-    if lock_file.exists():
-        with open(lock_file) as f:
-            return yaml.safe_load(f)
-
+    # First try exact digest match
+    digest = extract_digest_from_image(reference)
+    lock_data = load_from_registry(digest)
+    if lock_data:
+        return lock_data
+
+    # If not found and reference has a name, search by name pattern
+    if "/" in reference:
+        # Look for any cached version of this image
+        ref_base = reference.split("@")[0].split(":")[0]
+
+        for digest, lock_file in list_registry_entries():
+            try:
+                with open(lock_file) as f:
+                    lock_data = yaml.safe_load(f)
+                # Check if this matches our reference
+                if lock_data and "image" in lock_data:
+                    image = lock_data["image"]
+                    # Match by name (ignoring tag/digest)
+                    img_base = image.split("@")[0].split(":")[0]
+                    if ref_base in img_base or img_base in ref_base:
+                        return lock_data
+            except Exception:
+                continue
+
     return None


@@ -147,15 +142,8 @@ async def analyze_from_metadata(reference: str, output_format: str, verbose: boo
             source = "registry"

             # Save to local cache for next time
-            if "@sha256:" in lock_data.get("image", ""):
-                digest = lock_data["image"].split("@sha256:")[-1][:12]
-            else:
-                digest = "latest"
-
-            cache_dir = Path.home() / ".hud" / "envs" / digest
-            cache_dir.mkdir(parents=True, exist_ok=True)
-            with open(cache_dir / "hud.lock.yaml", "w") as f:  # noqa: ASYNC230
-                yaml.dump(lock_data, f, default_flow_style=False, sort_keys=False)
+            from .registry import save_to_registry
+            save_to_registry(lock_data, lock_data.get("image", ""), verbose=False)
         else:
             progress.update(task, description="[red]✗ Not found[/red]")

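The refactor above (and the build.py change below) routes all lock-file caching through a new .registry module that is not itself shown in this diff. Based on the code it replaces, the helpers likely behave roughly as sketched here; every name and detail below is an inference from the removed logic, not the released implementation.

    # Inferred shape of the registry helpers used above; illustrative only.
    from pathlib import Path

    import yaml


    def get_registry_dir() -> Path:
        """Local registry root, matching the ~/.hud/envs/ layout used by the old code."""
        return Path.home() / ".hud" / "envs"


    def extract_digest_from_image(reference: str) -> str:
        """Short digest for an image reference, else 'latest' (mirrors the removed logic)."""
        if "@sha256:" in reference:
            return reference.split("@sha256:")[-1][:12]
        return "latest"


    def load_from_registry(digest: str) -> dict | None:
        """Load ~/.hud/envs/<digest>/hud.lock.yaml if it exists."""
        lock_file = get_registry_dir() / digest / "hud.lock.yaml"
        if lock_file.exists():
            with open(lock_file) as f:
                return yaml.safe_load(f)
        return None


    def list_registry_entries() -> list[tuple[str, Path]]:
        """Return (digest, lock_file_path) pairs for every cached environment."""
        entries = []
        root = get_registry_dir()
        if root.exists():
            for env_dir in root.iterdir():
                lock_file = env_dir / "hud.lock.yaml"
                if env_dir.is_dir() and lock_file.exists():
                    entries.append((env_dir.name, lock_file))
        return entries


    def save_to_registry(lock_data: dict, image: str, verbose: bool = False) -> None:
        """Write lock data to ~/.hud/envs/<digest>/hud.lock.yaml (mirrors the removed caching code)."""
        cache_dir = get_registry_dir() / extract_digest_from_image(image)
        cache_dir.mkdir(parents=True, exist_ok=True)
        with open(cache_dir / "hud.lock.yaml", "w") as f:
            yaml.dump(lock_data, f, default_flow_style=False, sort_keys=False)
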
hud/cli/build.py CHANGED
@@ -17,6 +17,8 @@ from hud.clients import MCPClient
17
17
  from hud.utils.design import HUDDesign
18
18
  from hud.version import __version__ as hud_version
19
19
 
20
+ from .registry import save_to_registry
21
+
20
22
 
21
23
  def parse_version(version_str: str) -> tuple[int, int, int]:
22
24
  """Parse version string like '1.0.0' or '1.0' into tuple of integers."""
@@ -459,6 +461,11 @@ def build_environment(
459
461
  # Remove temp image after we're done
460
462
  subprocess.run(["docker", "rmi", temp_tag], capture_output=True) # noqa: S603, S607
461
463
 
464
+ # Add to local registry
465
+ if image_id:
466
+ # Save to local registry using the helper
467
+ save_to_registry(lock_content, lock_content.get("image", tag), verbose)
468
+
462
469
  # Print summary
463
470
  design.section_title("Build Complete")
464
471
 
hud/cli/debug.py CHANGED
@@ -167,7 +167,14 @@ async def debug_mcp_stdio(command: list[str], logger: CaptureLogger, max_phase:
                     break
             except Exception as e:
                 logger.error(f"Failed to parse MCP response: {e}")
-                continue
+                logger.error(f"Raw output that caused the error: {repr(line)}")
+                logger.hint("This usually means non-JSON output is being sent to STDOUT")
+                logger.hint("Common causes:")
+                logger.hint("  - Print statements in your server code")
+                logger.hint("  - Library warnings (use warnings.filterwarnings)")
+                logger.hint("  - Import-time output from dependencies")
+                phases_completed = 1  # Mark as failed
+                break  # Stop trying to parse

         if response and "result" in response:
             logger.success("MCP server initialized successfully")
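
The new hints above point at the usual culprit for this failure: anything written to STDOUT corrupts the stdio JSON-RPC stream. A minimal sketch of the server-side hygiene the hints suggest, using only the Python standard library (illustrative, not part of this package):

    # Keep STDOUT reserved for JSON-RPC when serving MCP over stdio.
    import logging
    import sys
    import warnings

    # Silence library warnings that would otherwise print during import or startup.
    warnings.filterwarnings("ignore")

    # Route logging (and any debug output) to STDERR instead of STDOUT.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    print("starting up", file=sys.stderr)  # never a bare print() in a stdio MCP server
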
hud/cli/eval.py ADDED
@@ -0,0 +1,226 @@
+"""HUD evaluation command for running tasks and datasets."""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+from pathlib import Path
+from typing import Any, Literal
+
+import typer
+from datasets import load_dataset
+
+import hud
+from hud.agents import ClaudeAgent, OperatorAgent
+from hud.agents.misc.response_agent import ResponseAgent
+from hud.datasets import Task, run_dataset
+from hud.utils.design import HUDDesign
+
+logger = logging.getLogger(__name__)
+design = HUDDesign()
+
+
+def build_agent(
+    agent_type: Literal["claude", "openai"],
+    *,
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+) -> ClaudeAgent | OperatorAgent:
+    """Create and return the requested agent type."""
+
+    if agent_type == "openai":
+        allowed_tools = allowed_tools or ["openai_computer"]
+
+        return OperatorAgent(
+            allowed_tools=allowed_tools,
+            response_agent=ResponseAgent(),
+        )
+
+    # Fallback Claude agent (Anthropic)
+    model = model or "claude-sonnet-4-20250514"
+    allowed_tools = allowed_tools or ["anthropic_computer"]
+
+    return ClaudeAgent(
+        model=model,
+        allowed_tools=allowed_tools,
+        response_agent=ResponseAgent(),
+    )
+
+
+async def run_single_task(
+    source: str,
+    *,
+    agent_type: Literal["claude", "openai"] = "claude",
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+    max_steps: int = 10,
+) -> None:
+    """Load one task and execute it."""
+
+    design.info("📊 Loading dataset…")
+
+    # Check if it's a single task JSON file
+    path = Path(source)
+    if path.exists() and path.suffix == ".json":
+        with open(path, "r") as f:
+            task_data = json.load(f)
+        task = Task(**task_data)
+    else:
+        # Load from HuggingFace dataset
+        dataset = load_dataset(source, split="train")
+
+        # Get first task from dataset
+        sample_task = dataset[0]  # type: ignore[index]
+        task = Task(**sample_task)  # type: ignore[arg-type]
+
+    task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt
+
+    with hud.trace(name=task_prompt):
+        agent = build_agent(
+            agent_type,
+            model=model,
+            allowed_tools=allowed_tools,
+        )
+        design.info(task.prompt)
+        result = await agent.run(task, max_steps=max_steps)
+        design.success(f"Reward: {result.reward}")
+
+
+async def run_full_dataset(
+    source: str,
+    *,
+    agent_type: Literal["claude", "openai"] = "claude",
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+    max_concurrent: int = 30,
+    max_steps: int = 50,
+) -> list[Any]:
+    """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
+
+    # Build agent class + config for run_dataset
+    if agent_type == "openai":
+        agent_class = OperatorAgent
+        agent_config: dict[str, Any] = {
+            "allowed_tools": allowed_tools or ["openai_computer"],
+        }
+    else:
+        agent_class = ClaudeAgent
+        agent_config = {
+            "model": model or "claude-sonnet-4-20250514",
+            "allowed_tools": allowed_tools or ["anthropic_computer"],
+        }
+
+    design.info("🚀 Running evaluation…")
+    return await run_dataset(
+        name=f"Evaluation {source.split('/')[-1]}",
+        dataset=source,
+        agent_class=agent_class,
+        agent_config=agent_config,
+        max_concurrent=max_concurrent,
+        metadata={"dataset": source},
+        max_steps=max_steps,
+        auto_respond=True,
+    )
+
+
+def eval_command(
+    source: str = typer.Argument(
+        ...,
+        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
+    ),
+    full: bool = typer.Option(
+        False,
+        "--full",
+        help="Run the entire dataset (omit for single-task debug mode)",
+    ),
+    agent: Literal["claude", "openai"] = typer.Option(
+        "claude",
+        "--agent",
+        help="Agent backend to use",
+    ),
+    model: str | None = typer.Option(
+        None,
+        "--model",
+        help="Model name for the chosen agent",
+    ),
+    allowed_tools: str | None = typer.Option(
+        None,
+        "--allowed-tools",
+        help="Comma-separated list of allowed tools",
+    ),
+    max_concurrent: int = typer.Option(
+        50,
+        "--max-concurrent",
+        help="Concurrency level for full-dataset mode",
+    ),
+    max_steps: int = typer.Option(
+        None,
+        "--max-steps",
+        help="Maximum steps per task (default: 10 for single, 50 for full)",
+    ),
+) -> None:
+    """🚀 Run evaluation on datasets or individual tasks with agents.
+
+    Examples:
+        # Evaluate a single task from SheetBench
+        hud eval hud-evals/SheetBench-50
+
+        # Evaluate the FULL SheetBench dataset with Claude
+        hud eval hud-evals/SheetBench-50 --full --agent claude
+
+        # Run a single task from a JSON file
+        hud eval task.json
+
+        # Run with OpenAI Operator agent
+        hud eval hud-evals/OSWorld-Gold-Beta --agent openai
+    """
+    from hud.settings import settings
+    import os
+
+    # Check for required API keys
+    if agent == "claude":
+        if not settings.anthropic_api_key or not os.environ.get("ANTHROPIC_API_KEY"):
+            design.error("ANTHROPIC_API_KEY is required for Claude agent")
+            design.info("Set it in your environment or .env file: ANTHROPIC_API_KEY=your-key-here")
+            raise typer.Exit(1)
+    elif agent == "openai":
+        if not settings.openai_api_key or not os.environ.get("OPENAI_API_KEY"):
+            design.error("OPENAI_API_KEY is required for OpenAI agent")
+            design.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
+            raise typer.Exit(1)
+
+    # Check for HUD_API_KEY if using HUD services
+    if not settings.api_key or not os.environ.get("HUD_API_KEY"):
+        design.warning("HUD_API_KEY not set. Some features may be limited.")
+        design.info("Get your API key at: https://app.hud.so")
+
+    # Parse allowed tools
+    allowed_tools_list = (
+        [t.strip() for t in allowed_tools.split(",") if t.strip()]
+        if allowed_tools
+        else None
+    )
+
+    # Set default max_steps if not provided
+    if max_steps is None:
+        max_steps = 50 if full else 10
+
+    # Run evaluation
+    if full:
+        asyncio.run(run_full_dataset(
+            source,
+            agent_type=agent,
+            model=model,
+            allowed_tools=allowed_tools_list,
+            max_concurrent=max_concurrent,
+            max_steps=max_steps,
+        ))
+    else:
+        asyncio.run(run_single_task(
+            source,
+            agent_type=agent,
+            model=model,
+            allowed_tools=allowed_tools_list,
+            max_steps=max_steps,
+        ))
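
For reference, the single-task helper added above can also be driven directly from Python. A minimal sketch, assuming the module path shown in this diff and a local task.json in the format the command expects:

    # Minimal programmatic use of the new helper (assumes hud.cli.eval as added above).
    import asyncio

    from hud.cli.eval import run_single_task

    # Run the task defined in task.json with the Claude agent, capped at 10 steps.
    asyncio.run(run_single_task("task.json", agent_type="claude", max_steps=10))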