hud-python 0.4.12__py3-none-any.whl → 0.4.14__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/cli/__init__.py +8 -1
- hud/cli/dev.py +41 -13
- hud/cli/eval.py +36 -31
- hud/cli/init.py +448 -69
- hud/cli/list_func.py +1 -1
- hud/clients/fastmcp.py +2 -12
- hud/clients/mcp_use.py +1 -7
- hud/otel/instrumentation.py +5 -1
- hud/server/server.py +1 -1
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.12.dist-info → hud_python-0.4.14.dist-info}/METADATA +6 -7
- {hud_python-0.4.12.dist-info → hud_python-0.4.14.dist-info}/RECORD +16 -16
- {hud_python-0.4.12.dist-info → hud_python-0.4.14.dist-info}/WHEEL +0 -0
- {hud_python-0.4.12.dist-info → hud_python-0.4.14.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.12.dist-info → hud_python-0.4.14.dist-info}/licenses/LICENSE +0 -0
hud/cli/__init__.py
CHANGED
|
@@ -348,6 +348,11 @@ def dev(
|
|
|
348
348
|
),
|
|
349
349
|
port: int = typer.Option(8765, "--port", "-p", help="HTTP server port (ignored for stdio)"),
|
|
350
350
|
no_reload: bool = typer.Option(False, "--no-reload", help="Disable hot-reload"),
|
|
351
|
+
full_reload: bool = typer.Option(
|
|
352
|
+
False,
|
|
353
|
+
"--full-reload",
|
|
354
|
+
help="Restart entire container on file changes (instead of just server process)",
|
|
355
|
+
),
|
|
351
356
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="Show server logs"),
|
|
352
357
|
inspector: bool = typer.Option(
|
|
353
358
|
False, "--inspector", help="Launch MCP Inspector (HTTP mode only)"
|
|
@@ -375,12 +380,13 @@ def dev(
|
|
|
375
380
|
hud dev . --inspector # Launch MCP Inspector (HTTP mode only)
|
|
376
381
|
hud dev . --interactive # Launch interactive testing mode (HTTP mode only)
|
|
377
382
|
hud dev . --no-logs # Disable Docker log streaming
|
|
383
|
+
hud dev . --full-reload # Restart entire container on file changes (instead of just server)
|
|
378
384
|
|
|
379
385
|
# With Docker arguments (after all options):
|
|
380
386
|
hud dev . -e BROWSER_PROVIDER=anchorbrowser -e ANCHOR_API_KEY=xxx
|
|
381
387
|
hud dev . -e API_KEY=secret -v /tmp/data:/data --network host
|
|
382
388
|
hud dev . --build -e DEBUG=true --memory 2g
|
|
383
|
-
"""
|
|
389
|
+
""" # noqa: E501
|
|
384
390
|
# Parse directory and Docker arguments
|
|
385
391
|
if params:
|
|
386
392
|
directory = params[0]
|
|
@@ -397,6 +403,7 @@ def dev(
|
|
|
397
403
|
transport,
|
|
398
404
|
port,
|
|
399
405
|
no_reload,
|
|
406
|
+
full_reload,
|
|
400
407
|
verbose,
|
|
401
408
|
inspector,
|
|
402
409
|
no_logs,
|
hud/cli/dev.py
CHANGED
|
@@ -35,6 +35,7 @@ def create_proxy_server(
|
|
|
35
35
|
directory: str | Path,
|
|
36
36
|
image_name: str,
|
|
37
37
|
no_reload: bool = False,
|
|
38
|
+
full_reload: bool = False,
|
|
38
39
|
verbose: bool = False,
|
|
39
40
|
docker_args: list[str] | None = None,
|
|
40
41
|
interactive: bool = False,
|
|
@@ -48,8 +49,12 @@ def create_proxy_server(
|
|
|
48
49
|
design.warning(f"Could not extract CMD from {image_name}, using default")
|
|
49
50
|
original_cmd = ["python", "-m", "hud_controller.server"]
|
|
50
51
|
|
|
51
|
-
# Generate container name from image
|
|
52
|
-
|
|
52
|
+
# Generate unique container name from image to avoid conflicts between multiple instances
|
|
53
|
+
import os
|
|
54
|
+
|
|
55
|
+
pid = str(os.getpid())[-6:] # Last 6 digits of process ID for uniqueness
|
|
56
|
+
base_name = image_name.replace(":", "-").replace("/", "-")
|
|
57
|
+
container_name = f"{base_name}-{pid}"
|
|
53
58
|
|
|
54
59
|
# Build the docker run command
|
|
55
60
|
docker_cmd = [
|
|
@@ -73,14 +78,20 @@ def create_proxy_server(
|
|
|
73
78
|
if interactive:
|
|
74
79
|
no_reload = True
|
|
75
80
|
|
|
76
|
-
|
|
77
|
-
|
|
81
|
+
# Validate reload options
|
|
82
|
+
if no_reload and full_reload:
|
|
83
|
+
design.warning("Cannot use --full-reload with --no-reload, ignoring --full-reload")
|
|
84
|
+
full_reload = False
|
|
85
|
+
|
|
86
|
+
if not no_reload and not full_reload:
|
|
87
|
+
# Standard hot-reload: inject supervisor for server restart within container
|
|
78
88
|
modified_cmd = inject_supervisor(original_cmd)
|
|
79
89
|
docker_cmd.extend(["--entrypoint", modified_cmd[0]])
|
|
80
90
|
docker_cmd.append(image_name)
|
|
81
91
|
docker_cmd.extend(modified_cmd[1:])
|
|
82
92
|
else:
|
|
83
|
-
# No reload
|
|
93
|
+
# No reload or full reload: use original CMD without supervisor
|
|
94
|
+
# Note: Full reload logic (container restart) would be implemented here in the future
|
|
84
95
|
docker_cmd.append(image_name)
|
|
85
96
|
|
|
86
97
|
# Create configuration following MCPConfig schema
|
|
@@ -96,9 +107,14 @@ def create_proxy_server(
|
|
|
96
107
|
|
|
97
108
|
# Debug output - only if verbose
|
|
98
109
|
if verbose:
|
|
99
|
-
if not no_reload:
|
|
110
|
+
if not no_reload and not full_reload:
|
|
111
|
+
design.info("Mode: Hot-reload (server restart within container)")
|
|
100
112
|
design.info("Watching: /app/src for changes")
|
|
113
|
+
elif full_reload:
|
|
114
|
+
design.info("Mode: Full reload (container restart on file changes)")
|
|
115
|
+
design.info("Note: Full container restart not yet implemented, using no-reload mode")
|
|
101
116
|
else:
|
|
117
|
+
design.info("Mode: No reload")
|
|
102
118
|
design.info("Container will run without hot-reload")
|
|
103
119
|
design.command_example(f"docker logs -f {container_name}", "View container logs")
|
|
104
120
|
|
|
@@ -127,6 +143,7 @@ async def start_mcp_proxy(
|
|
|
127
143
|
transport: str,
|
|
128
144
|
port: int,
|
|
129
145
|
no_reload: bool = False,
|
|
146
|
+
full_reload: bool = False,
|
|
130
147
|
verbose: bool = False,
|
|
131
148
|
inspector: bool = False,
|
|
132
149
|
no_logs: bool = False,
|
|
@@ -212,8 +229,12 @@ async def start_mcp_proxy(
|
|
|
212
229
|
design.error(f"Source directory not found: {src_path}")
|
|
213
230
|
raise click.Abort
|
|
214
231
|
|
|
215
|
-
# Extract container name from the proxy configuration
|
|
216
|
-
|
|
232
|
+
# Extract container name from the proxy configuration (must match create_proxy_server naming)
|
|
233
|
+
import os
|
|
234
|
+
|
|
235
|
+
pid = str(os.getpid())[-6:] # Last 6 digits of process ID for uniqueness
|
|
236
|
+
base_name = image_name.replace(":", "-").replace("/", "-")
|
|
237
|
+
container_name = f"{base_name}-{pid}"
|
|
217
238
|
|
|
218
239
|
# Remove any existing container with the same name (silently)
|
|
219
240
|
# Note: The proxy creates containers on-demand when clients connect
|
|
@@ -347,6 +368,7 @@ async def start_mcp_proxy(
|
|
|
347
368
|
# Always show waiting message
|
|
348
369
|
log_design.info("") # Empty line for spacing
|
|
349
370
|
log_design.progress_message("⏳ Waiting for first client connection to start container...")
|
|
371
|
+
log_design.info(f"📋 Looking for container: {container_name}") # noqa: G004
|
|
350
372
|
|
|
351
373
|
# Keep trying to stream logs - container is created on demand
|
|
352
374
|
has_shown_started = False
|
|
@@ -397,7 +419,8 @@ async def start_mcp_proxy(
|
|
|
397
419
|
|
|
398
420
|
# Show all logs with gold formatting like hud debug
|
|
399
421
|
# Format all logs in gold/dim style like hud debug's stderr
|
|
400
|
-
|
|
422
|
+
# Use stdout console to avoid stderr redirection when not verbose
|
|
423
|
+
log_design._stdout_console.print(
|
|
401
424
|
f"[rgb(192,150,12)]■[/rgb(192,150,12)] {decoded_line}", highlight=False
|
|
402
425
|
)
|
|
403
426
|
|
|
@@ -408,16 +431,19 @@ async def start_mcp_proxy(
|
|
|
408
431
|
await asyncio.sleep(1)
|
|
409
432
|
continue # Loop back to check if container exists
|
|
410
433
|
|
|
411
|
-
except Exception:
|
|
412
|
-
# Some unexpected error
|
|
434
|
+
except Exception as e:
|
|
435
|
+
# Some unexpected error - show it so we can debug
|
|
436
|
+
log_design.warning(f"Failed to stream Docker logs: {e}") # noqa: G004
|
|
413
437
|
if verbose:
|
|
414
|
-
|
|
438
|
+
import traceback
|
|
439
|
+
|
|
440
|
+
log_design.warning(f"Traceback: {traceback.format_exc()}") # noqa: G004
|
|
415
441
|
await asyncio.sleep(1)
|
|
416
442
|
|
|
417
443
|
# CRITICAL: Create proxy AFTER all logging setup to prevent it from resetting logging config
|
|
418
444
|
# This is important because FastMCP might initialize loggers during creation
|
|
419
445
|
proxy = create_proxy_server(
|
|
420
|
-
directory, image_name, no_reload, verbose, docker_args or [], interactive
|
|
446
|
+
directory, image_name, no_reload, full_reload, verbose, docker_args or [], interactive
|
|
421
447
|
)
|
|
422
448
|
|
|
423
449
|
# One more attempt to suppress the FastMCP server log
|
|
@@ -548,6 +574,7 @@ def run_mcp_dev_server(
|
|
|
548
574
|
transport: str = "http",
|
|
549
575
|
port: int = 8765,
|
|
550
576
|
no_reload: bool = False,
|
|
577
|
+
full_reload: bool = False,
|
|
551
578
|
verbose: bool = False,
|
|
552
579
|
inspector: bool = False,
|
|
553
580
|
no_logs: bool = False,
|
|
@@ -706,6 +733,7 @@ def run_mcp_dev_server(
|
|
|
706
733
|
transport,
|
|
707
734
|
port,
|
|
708
735
|
no_reload,
|
|
736
|
+
full_reload,
|
|
709
737
|
verbose,
|
|
710
738
|
inspector,
|
|
711
739
|
no_logs,
|
hud/cli/eval.py
CHANGED
|
@@ -26,15 +26,6 @@ def build_agent(
|
|
|
26
26
|
"""Create and return the requested agent type."""
|
|
27
27
|
|
|
28
28
|
# Import agents lazily to avoid dependency issues
|
|
29
|
-
try:
|
|
30
|
-
from hud.agents.misc.response_agent import ResponseAgent
|
|
31
|
-
except ImportError as e:
|
|
32
|
-
design.error(
|
|
33
|
-
"Agent dependencies are not installed. "
|
|
34
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
35
|
-
)
|
|
36
|
-
raise typer.Exit(1) from e
|
|
37
|
-
|
|
38
29
|
if agent_type == "openai":
|
|
39
30
|
try:
|
|
40
31
|
from hud.agents import OperatorAgent
|
|
@@ -45,12 +36,12 @@ def build_agent(
|
|
|
45
36
|
)
|
|
46
37
|
raise typer.Exit(1) from e
|
|
47
38
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
39
|
+
if allowed_tools:
|
|
40
|
+
return OperatorAgent(
|
|
41
|
+
allowed_tools=allowed_tools,
|
|
42
|
+
)
|
|
43
|
+
else:
|
|
44
|
+
return OperatorAgent()
|
|
54
45
|
|
|
55
46
|
# Fallback Claude agent (Anthropic)
|
|
56
47
|
try:
|
|
@@ -63,13 +54,16 @@ def build_agent(
|
|
|
63
54
|
raise typer.Exit(1) from e
|
|
64
55
|
|
|
65
56
|
model = model or "claude-sonnet-4-20250514"
|
|
66
|
-
allowed_tools = allowed_tools or ["anthropic_computer"]
|
|
67
57
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
58
|
+
if allowed_tools:
|
|
59
|
+
return ClaudeAgent(
|
|
60
|
+
model=model,
|
|
61
|
+
allowed_tools=allowed_tools,
|
|
62
|
+
)
|
|
63
|
+
else:
|
|
64
|
+
return ClaudeAgent(
|
|
65
|
+
model=model,
|
|
66
|
+
)
|
|
73
67
|
|
|
74
68
|
|
|
75
69
|
async def run_single_task(
|
|
@@ -100,8 +94,8 @@ async def run_single_task(
|
|
|
100
94
|
with open(path) as f: # noqa: ASYNC230
|
|
101
95
|
json_data = json.load(f)
|
|
102
96
|
|
|
103
|
-
# Check if JSON contains
|
|
104
|
-
if isinstance(json_data, list):
|
|
97
|
+
# Check if JSON contains multiple tasks (list with more than 1 task)
|
|
98
|
+
if isinstance(json_data, list) and len(json_data) > 1:
|
|
105
99
|
design.info(f"Found {len(json_data)} tasks in JSON file, running as dataset…")
|
|
106
100
|
|
|
107
101
|
# Build agent class and config for run_dataset
|
|
@@ -118,8 +112,10 @@ async def run_single_task(
|
|
|
118
112
|
raise typer.Exit(1) from e
|
|
119
113
|
|
|
120
114
|
agent_config: dict[str, Any] = {
|
|
121
|
-
"allowed_tools": allowed_tools or ["openai_computer"],
|
|
122
115
|
}
|
|
116
|
+
if allowed_tools:
|
|
117
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
118
|
+
|
|
123
119
|
else:
|
|
124
120
|
try:
|
|
125
121
|
from hud.agents import ClaudeAgent
|
|
@@ -134,8 +130,9 @@ async def run_single_task(
|
|
|
134
130
|
|
|
135
131
|
agent_config = {
|
|
136
132
|
"model": model or "claude-sonnet-4-20250514",
|
|
137
|
-
"allowed_tools": allowed_tools or ["anthropic_computer"],
|
|
138
133
|
}
|
|
134
|
+
if allowed_tools:
|
|
135
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
139
136
|
|
|
140
137
|
# Run as dataset with single-task concurrency to maintain debug behavior
|
|
141
138
|
results = await run_dataset(
|
|
@@ -146,7 +143,6 @@ async def run_single_task(
|
|
|
146
143
|
max_concurrent=1, # Run sequentially for debug mode
|
|
147
144
|
metadata={"source": str(path)},
|
|
148
145
|
max_steps=max_steps,
|
|
149
|
-
auto_respond=True,
|
|
150
146
|
)
|
|
151
147
|
|
|
152
148
|
# Display summary
|
|
@@ -154,8 +150,15 @@ async def run_single_task(
|
|
|
154
150
|
design.success(f"Completed {len(results)} tasks: {successful} successful")
|
|
155
151
|
return
|
|
156
152
|
|
|
157
|
-
# Single task JSON
|
|
158
|
-
|
|
153
|
+
# Single task JSON (either direct object or list with 1 task)
|
|
154
|
+
if isinstance(json_data, list) and len(json_data) == 1:
|
|
155
|
+
design.info("Found 1 task in JSON file, running as single task…")
|
|
156
|
+
task = Task(**json_data[0])
|
|
157
|
+
elif isinstance(json_data, dict):
|
|
158
|
+
task = Task(**json_data)
|
|
159
|
+
else:
|
|
160
|
+
design.error("JSON file must contain a list of tasks when using --full flag")
|
|
161
|
+
raise typer.Exit(1)
|
|
159
162
|
else:
|
|
160
163
|
# Load from HuggingFace dataset
|
|
161
164
|
try:
|
|
@@ -238,8 +241,10 @@ async def run_full_dataset(
|
|
|
238
241
|
raise typer.Exit(1) from e
|
|
239
242
|
|
|
240
243
|
agent_config: dict[str, Any] = {
|
|
241
|
-
"allowed_tools": allowed_tools or ["openai_computer"],
|
|
242
244
|
}
|
|
245
|
+
if allowed_tools:
|
|
246
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
247
|
+
|
|
243
248
|
else:
|
|
244
249
|
try:
|
|
245
250
|
from hud.agents import ClaudeAgent
|
|
@@ -254,8 +259,9 @@ async def run_full_dataset(
|
|
|
254
259
|
|
|
255
260
|
agent_config = {
|
|
256
261
|
"model": model or "claude-sonnet-4-20250514",
|
|
257
|
-
"allowed_tools": allowed_tools or ["anthropic_computer"],
|
|
258
262
|
}
|
|
263
|
+
if allowed_tools:
|
|
264
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
259
265
|
|
|
260
266
|
design.info("🚀 Running evaluation…")
|
|
261
267
|
return await run_dataset(
|
|
@@ -266,7 +272,6 @@ async def run_full_dataset(
|
|
|
266
272
|
max_concurrent=max_concurrent,
|
|
267
273
|
metadata={"dataset": source},
|
|
268
274
|
max_steps=max_steps,
|
|
269
|
-
auto_respond=True,
|
|
270
275
|
)
|
|
271
276
|
|
|
272
277
|
|
hud/cli/init.py
CHANGED
|
@@ -15,20 +15,14 @@ DOCKERFILE_TEMPLATE = """FROM python:3.11-slim
|
|
|
15
15
|
|
|
16
16
|
WORKDIR /app
|
|
17
17
|
|
|
18
|
-
# Install git for hud-python dependency
|
|
19
|
-
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
|
|
20
|
-
|
|
21
18
|
# Copy and install dependencies
|
|
22
19
|
COPY pyproject.toml ./
|
|
23
20
|
COPY src/ ./src/
|
|
24
21
|
RUN pip install --no-cache-dir -e .
|
|
25
22
|
|
|
26
|
-
# Set logging to stderr
|
|
27
|
-
ENV HUD_LOG_STREAM=stderr
|
|
28
|
-
|
|
29
23
|
# Start context server in background, then MCP server
|
|
30
|
-
CMD ["sh", "-c", "python -m
|
|
31
|
-
"""
|
|
24
|
+
CMD ["sh", "-c", "python -m controller.env & sleep 1 && exec python -m controller.server"]
|
|
25
|
+
"""
|
|
32
26
|
|
|
33
27
|
PYPROJECT_TEMPLATE = """[project]
|
|
34
28
|
name = "{name}"
|
|
@@ -50,112 +44,472 @@ image = "{name}:dev"
|
|
|
50
44
|
allow-direct-references = true
|
|
51
45
|
|
|
52
46
|
[tool.hatch.build.targets.wheel]
|
|
53
|
-
packages = ["src/
|
|
47
|
+
packages = ["src/controller"]
|
|
54
48
|
"""
|
|
55
49
|
|
|
56
|
-
|
|
50
|
+
ENV_TEMPLATE = '''"""Minimal environment that persists across hot-reloads."""
|
|
57
51
|
from hud.server.context import run_context_server
|
|
58
52
|
import asyncio
|
|
59
53
|
|
|
60
|
-
class
|
|
54
|
+
class Environment:
|
|
55
|
+
"""Simple counter environment."""
|
|
56
|
+
|
|
61
57
|
def __init__(self):
|
|
62
58
|
self.count = 0
|
|
63
59
|
|
|
64
60
|
def act(self):
|
|
61
|
+
"""Increment the counter."""
|
|
65
62
|
self.count += 1
|
|
66
63
|
return self.count
|
|
67
64
|
|
|
68
65
|
def get_count(self):
|
|
66
|
+
"""Get current counter."""
|
|
69
67
|
return self.count
|
|
68
|
+
|
|
69
|
+
def reset(self):
|
|
70
|
+
"""Reset counter to zero."""
|
|
71
|
+
self.count = 0
|
|
70
72
|
|
|
71
73
|
if __name__ == "__main__":
|
|
72
|
-
asyncio.run(run_context_server(
|
|
74
|
+
asyncio.run(run_context_server(Environment(), sock_path="/tmp/hud_ctx.sock"))
|
|
73
75
|
'''
|
|
74
76
|
|
|
75
77
|
SERVER_TEMPLATE = '''"""Minimal MCP server for HUD."""
|
|
78
|
+
import sys
|
|
79
|
+
import logging
|
|
76
80
|
from hud.server import MCPServer
|
|
77
81
|
from hud.server.context import attach_context
|
|
82
|
+
from hud.tools.types import EvaluationResult
|
|
83
|
+
|
|
84
|
+
# Configure logging to stderr
|
|
85
|
+
logging.basicConfig(
|
|
86
|
+
stream=sys.stderr,
|
|
87
|
+
level=logging.INFO,
|
|
88
|
+
format='[%(levelname)s] %(asctime)s | %(name)s | %(message)s'
|
|
89
|
+
)
|
|
78
90
|
|
|
79
91
|
mcp = MCPServer(name="{name}")
|
|
80
|
-
|
|
92
|
+
env = None
|
|
81
93
|
|
|
82
94
|
@mcp.initialize
|
|
83
|
-
async def init(
|
|
84
|
-
global
|
|
85
|
-
|
|
95
|
+
async def init(ctx):
|
|
96
|
+
global env
|
|
97
|
+
env = attach_context("/tmp/hud_ctx.sock")
|
|
98
|
+
logging.info("Connected to context server")
|
|
86
99
|
|
|
87
100
|
@mcp.shutdown
|
|
88
101
|
async def cleanup():
|
|
89
|
-
global
|
|
90
|
-
|
|
102
|
+
global env
|
|
103
|
+
env = None
|
|
91
104
|
|
|
92
105
|
@mcp.tool()
|
|
93
106
|
async def act() -> str:
|
|
94
|
-
"""Perform an action."""
|
|
95
|
-
|
|
107
|
+
"""Perform an action that changes the environment state."""
|
|
108
|
+
if env is None:
|
|
109
|
+
raise RuntimeError("Context not initialized")
|
|
110
|
+
count = env.act()
|
|
111
|
+
return f"Action #{{count}} performed. Current count: {{count}}"
|
|
96
112
|
|
|
97
113
|
@mcp.tool()
|
|
98
114
|
async def setup() -> str:
|
|
99
|
-
"""
|
|
100
|
-
|
|
115
|
+
"""Reset the environment to initial state."""
|
|
116
|
+
if env is None:
|
|
117
|
+
raise RuntimeError("Context not initialized")
|
|
118
|
+
env.reset()
|
|
119
|
+
return "Counter reset to 0"
|
|
101
120
|
|
|
102
121
|
@mcp.tool()
|
|
103
|
-
async def evaluate() ->
|
|
104
|
-
"""
|
|
105
|
-
|
|
122
|
+
async def evaluate(target: int = 10) -> EvaluationResult:
|
|
123
|
+
"""Check if the counter reached the target value."""
|
|
124
|
+
if env is None:
|
|
125
|
+
raise RuntimeError("Context not initialized")
|
|
126
|
+
current_count = env.get_count()
|
|
127
|
+
|
|
128
|
+
# Calculate reward as progress towards target
|
|
129
|
+
reward = min(current_count / target, 1.0) if target > 0 else 0.0
|
|
130
|
+
done = current_count >= target
|
|
131
|
+
|
|
132
|
+
return EvaluationResult(
|
|
133
|
+
reward=reward,
|
|
134
|
+
done=done,
|
|
135
|
+
content=f"Counter at {{current_count}}/{{target}}"
|
|
136
|
+
)
|
|
106
137
|
|
|
107
138
|
if __name__ == "__main__":
|
|
108
139
|
mcp.run()
|
|
109
140
|
'''
|
|
110
141
|
|
|
142
|
+
TASKS_JSON_TEMPLATE = '''[
|
|
143
|
+
{{
|
|
144
|
+
"prompt": "Increment the counter to reach 10",
|
|
145
|
+
"mcp_config": {{
|
|
146
|
+
"{name}": {{
|
|
147
|
+
"url": "http://localhost:8765/mcp"
|
|
148
|
+
}}
|
|
149
|
+
}},
|
|
150
|
+
"setup_tool": {{
|
|
151
|
+
"name": "setup",
|
|
152
|
+
"arguments": {{}}
|
|
153
|
+
}},
|
|
154
|
+
"evaluate_tool": {{
|
|
155
|
+
"name": "evaluate",
|
|
156
|
+
"arguments": {{
|
|
157
|
+
"target": 10
|
|
158
|
+
}}
|
|
159
|
+
}}
|
|
160
|
+
}}
|
|
161
|
+
]
|
|
162
|
+
'''
|
|
163
|
+
|
|
164
|
+
TEST_TASK_TEMPLATE = '''#!/usr/bin/env python
|
|
165
|
+
"""Simple example of running tasks from tasks.json.
|
|
166
|
+
|
|
167
|
+
Make sure to run 'hud dev --build' in another terminal first!
|
|
168
|
+
"""
|
|
169
|
+
|
|
170
|
+
import asyncio
|
|
171
|
+
import json
|
|
172
|
+
from hud.datasets import Task
|
|
173
|
+
from hud.clients import MCPClient
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
async def run_task(task_data: dict):
|
|
177
|
+
task = Task(**task_data)
|
|
178
|
+
client = MCPClient(mcp_config=task.mcp_config)
|
|
179
|
+
|
|
180
|
+
try:
|
|
181
|
+
print("Initializing client...")
|
|
182
|
+
await client.initialize()
|
|
183
|
+
|
|
184
|
+
result = await client.call_tool(task.setup_tool) # type: ignore
|
|
185
|
+
print(f"✅ Setup: {{result.content}}")
|
|
186
|
+
|
|
187
|
+
print("\\n🔄 Performing actions:")
|
|
188
|
+
for _ in range(10):
|
|
189
|
+
result = await client.call_tool(name="act", arguments={{}})
|
|
190
|
+
print(f" {{result.content}}")
|
|
191
|
+
|
|
192
|
+
result = await client.call_tool(task.evaluate_tool) # type: ignore
|
|
193
|
+
print(f"\\n📊 Evaluation: {{result.content}}")
|
|
194
|
+
|
|
195
|
+
return result.content
|
|
196
|
+
except Exception as e:
|
|
197
|
+
if "connection" in str(e).lower():
|
|
198
|
+
print("❌ Could not connect. Make sure 'hud dev --build' is running in another terminal.")
|
|
199
|
+
else:
|
|
200
|
+
raise e
|
|
201
|
+
finally:
|
|
202
|
+
await client.shutdown()
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
async def main():
|
|
206
|
+
for task_data in json.load(open("tasks.json")):
|
|
207
|
+
await run_task(task_data)
|
|
208
|
+
|
|
209
|
+
if __name__ == "__main__":
|
|
210
|
+
asyncio.run(main())
|
|
211
|
+
'''
|
|
212
|
+
|
|
213
|
+
NOTEBOOK_TEMPLATE = '''{{
|
|
214
|
+
"cells": [
|
|
215
|
+
{{
|
|
216
|
+
"cell_type": "markdown",
|
|
217
|
+
"metadata": {{}},
|
|
218
|
+
"source": [
|
|
219
|
+
"### Step 1: Create a Task\\n",
|
|
220
|
+
"\\n",
|
|
221
|
+
"A Task combines:\\n",
|
|
222
|
+
"- **Prompt**: What we want an agent to accomplish\\n",
|
|
223
|
+
"- **MCP Config**: How to spawn the environment\\n",
|
|
224
|
+
"- **Setup Tool**: How to prepare the environment\\n",
|
|
225
|
+
"- **Evaluate Tool**: How to check if the task succeeded"
|
|
226
|
+
]
|
|
227
|
+
}},
|
|
228
|
+
{{
|
|
229
|
+
"cell_type": "code",
|
|
230
|
+
"execution_count": null,
|
|
231
|
+
"metadata": {{}},
|
|
232
|
+
"outputs": [],
|
|
233
|
+
"source": [
|
|
234
|
+
"from hud.datasets import Task\\n",
|
|
235
|
+
"from hud.types import MCPToolCall\\n",
|
|
236
|
+
"\\n",
|
|
237
|
+
"# Create a task that uses our {name} environment\\n",
|
|
238
|
+
"# See tasks.json for how to build a loadable task dataset\\n",
|
|
239
|
+
"task = Task(\\n",
|
|
240
|
+
" prompt=\\"Increment the counter to reach 10\\",\\n",
|
|
241
|
+
" mcp_config={{\\n",
|
|
242
|
+
" \\"{name}\\": {{\\n",
|
|
243
|
+
" \\"url\\": \\"http://localhost:8765/mcp\\"\\n",
|
|
244
|
+
" }},\\n",
|
|
245
|
+
" }},\\n",
|
|
246
|
+
" setup_tool=MCPToolCall(name=\\"setup\\", arguments={{}}),\\n",
|
|
247
|
+
" evaluate_tool=MCPToolCall(name=\\"evaluate\\", arguments={{\\"target\\": 10}}),\\n",
|
|
248
|
+
")"
|
|
249
|
+
]
|
|
250
|
+
}},
|
|
251
|
+
{{
|
|
252
|
+
"cell_type": "markdown",
|
|
253
|
+
"metadata": {{}},
|
|
254
|
+
"source": [
|
|
255
|
+
"### Step 2: Initialize MCP Client\\n",
|
|
256
|
+
"\\n",
|
|
257
|
+
"Run `hud dev --build` before this cell to initialize the server at `http://localhost:8765/mcp`"
|
|
258
|
+
]
|
|
259
|
+
}},
|
|
260
|
+
{{
|
|
261
|
+
"cell_type": "code",
|
|
262
|
+
"execution_count": null,
|
|
263
|
+
"metadata": {{}},
|
|
264
|
+
"outputs": [],
|
|
265
|
+
"source": [
|
|
266
|
+
"from hud.clients import MCPClient\\n",
|
|
267
|
+
"\\n",
|
|
268
|
+
"# Create the client\\n",
|
|
269
|
+
"client = MCPClient(mcp_config=task.mcp_config, auto_trace=False)\\n",
|
|
270
|
+
"\\n",
|
|
271
|
+
"# Initialize it (this connects to our dev server)\\n",
|
|
272
|
+
"await client.initialize()"
|
|
273
|
+
]
|
|
274
|
+
}},
|
|
275
|
+
{{
|
|
276
|
+
"cell_type": "markdown",
|
|
277
|
+
"metadata": {{}},
|
|
278
|
+
"source": [
|
|
279
|
+
"### Step 3: Run Setup\\n",
|
|
280
|
+
"\\n",
|
|
281
|
+
"Call the setup tool to prepare the environment according to the task."
|
|
282
|
+
]
|
|
283
|
+
}},
|
|
284
|
+
{{
|
|
285
|
+
"cell_type": "code",
|
|
286
|
+
"execution_count": null,
|
|
287
|
+
"metadata": {{}},
|
|
288
|
+
"outputs": [],
|
|
289
|
+
"source": [
|
|
290
|
+
"# Run the setup from our task\\n",
|
|
291
|
+
"setup_result = await client.call_tool(task.setup_tool) # type: ignore\\n",
|
|
292
|
+
"print(f\\"Setup result: {{setup_result}}\\")"
|
|
293
|
+
]
|
|
294
|
+
}},
|
|
295
|
+
{{
|
|
296
|
+
"cell_type": "markdown",
|
|
297
|
+
"metadata": {{}},
|
|
298
|
+
"source": [
|
|
299
|
+
"### Step 4: Perform Actions\\n",
|
|
300
|
+
"\\n",
|
|
301
|
+
"Now we'll manually perform actions to complete the task. In a real scenario, an AI agent would figure out what actions to take."
|
|
302
|
+
]
|
|
303
|
+
}},
|
|
304
|
+
{{
|
|
305
|
+
"cell_type": "code",
|
|
306
|
+
"execution_count": null,
|
|
307
|
+
"metadata": {{}},
|
|
308
|
+
"outputs": [],
|
|
309
|
+
"source": [
|
|
310
|
+
"# Increment the counter 10 times\\n",
|
|
311
|
+
"for i in range(10):\\n",
|
|
312
|
+
" result = await client.call_tool(name=\\"act\\", arguments={{}})\\n",
|
|
313
|
+
" print(f\\"Step {{i+1}}: {{result.content}}\\")"
|
|
314
|
+
]
|
|
315
|
+
}},
|
|
316
|
+
{{
|
|
317
|
+
"cell_type": "markdown",
|
|
318
|
+
"metadata": {{}},
|
|
319
|
+
"source": [
|
|
320
|
+
"### Step 5: Evaluate Success\\n",
|
|
321
|
+
"\\n",
|
|
322
|
+
"Check if we completed the task according to the evaluation criteria."
|
|
323
|
+
]
|
|
324
|
+
}},
|
|
325
|
+
{{
|
|
326
|
+
"cell_type": "code",
|
|
327
|
+
"execution_count": null,
|
|
328
|
+
"metadata": {{}},
|
|
329
|
+
"outputs": [],
|
|
330
|
+
"source": [
|
|
331
|
+
"# Run the evaluation from our task\\n",
|
|
332
|
+
"eval_result = await client.call_tool(task.evaluate_tool) # type: ignore\\n",
|
|
333
|
+
"\\n",
|
|
334
|
+
"# The result is a list with one TextContent item containing JSON\\n",
|
|
335
|
+
"print(eval_result)"
|
|
336
|
+
]
|
|
337
|
+
}},
|
|
338
|
+
{{
|
|
339
|
+
"cell_type": "markdown",
|
|
340
|
+
"metadata": {{}},
|
|
341
|
+
"source": [
|
|
342
|
+
"### Step 6: Cleanup\\n",
|
|
343
|
+
"\\n",
|
|
344
|
+
"Always shut down the client when done to stop the Docker container. Either stop hud dev in the terminal, or run this command:"
|
|
345
|
+
]
|
|
346
|
+
}},
|
|
347
|
+
{{
|
|
348
|
+
"cell_type": "code",
|
|
349
|
+
"execution_count": null,
|
|
350
|
+
"metadata": {{}},
|
|
351
|
+
"outputs": [],
|
|
352
|
+
"source": [
|
|
353
|
+
"await client.shutdown()"
|
|
354
|
+
]
|
|
355
|
+
}},
|
|
356
|
+
{{
|
|
357
|
+
"cell_type": "markdown",
|
|
358
|
+
"metadata": {{}},
|
|
359
|
+
"source": [
|
|
360
|
+
"### Bonus: Running with an AI Agent\\n",
|
|
361
|
+
"\\n",
|
|
362
|
+
"Instead of manually calling tools, you can have an AI agent solve the task automatically."
|
|
363
|
+
]
|
|
364
|
+
}},
|
|
365
|
+
{{
|
|
366
|
+
"cell_type": "code",
|
|
367
|
+
"execution_count": null,
|
|
368
|
+
"metadata": {{}},
|
|
369
|
+
"outputs": [],
|
|
370
|
+
"source": [
|
|
371
|
+
"# Uncomment to run with Claude (requires ANTHROPIC_API_KEY)\\n",
|
|
372
|
+
"from hud.agents import ClaudeAgent\\n",
|
|
373
|
+
"\\n",
|
|
374
|
+
"# Create an agent\\n",
|
|
375
|
+
"agent = ClaudeAgent(\\n",
|
|
376
|
+
" model=\\"claude-sonnet-4-20250514\\",\\n",
|
|
377
|
+
" allowed_tools=[\\"act\\"] # Only allow the act tool\\n",
|
|
378
|
+
")\\n",
|
|
379
|
+
"\\n",
|
|
380
|
+
"# Run the task\\n",
|
|
381
|
+
"result = await agent.run(task)\\n",
|
|
382
|
+
"print(f\\"Final reward: {{result.reward}}\\")"
|
|
383
|
+
]
|
|
384
|
+
}},
|
|
385
|
+
{{
|
|
386
|
+
"cell_type": "markdown",
|
|
387
|
+
"metadata": {{}},
|
|
388
|
+
"source": [
|
|
389
|
+
"### Next Steps\\n",
|
|
390
|
+
"\\n",
|
|
391
|
+
"1. **Create your own evaluators**: Add new evaluation functions to `server.py`\\n",
|
|
392
|
+
"2. **Build complex environments**: Replace the simple counter with your actual application\\n",
|
|
393
|
+
"3. **Test with agents**: Use different AI models to solve your tasks\\n",
|
|
394
|
+
"\\n",
|
|
395
|
+
"For more examples, check out:\\n",
|
|
396
|
+
"- `environments/text_2048/` - A complete 2048 game environment\\n",
|
|
397
|
+
"- `environments/browser/` - A full browser automation environment with GUI"
|
|
398
|
+
]
|
|
399
|
+
}},
|
|
400
|
+
{{
|
|
401
|
+
"cell_type": "code",
|
|
402
|
+
"execution_count": null,
|
|
403
|
+
"metadata": {{}},
|
|
404
|
+
"outputs": [],
|
|
405
|
+
"source": []
|
|
406
|
+
}}
|
|
407
|
+
],
|
|
408
|
+
"metadata": {{
|
|
409
|
+
"kernelspec": {{
|
|
410
|
+
"display_name": "Python 3",
|
|
411
|
+
"language": "python",
|
|
412
|
+
"name": "python3"
|
|
413
|
+
}},
|
|
414
|
+
"language_info": {{
|
|
415
|
+
"codemirror_mode": {{
|
|
416
|
+
"name": "ipython",
|
|
417
|
+
"version": 3
|
|
418
|
+
}},
|
|
419
|
+
"file_extension": ".py",
|
|
420
|
+
"mimetype": "text/x-python",
|
|
421
|
+
"name": "python",
|
|
422
|
+
"nbconvert_exporter": "python",
|
|
423
|
+
"pygments_lexer": "ipython3",
|
|
424
|
+
"version": "3.11.0"
|
|
425
|
+
}}
|
|
426
|
+
}},
|
|
427
|
+
"nbformat": 4,
|
|
428
|
+
"nbformat_minor": 4
|
|
429
|
+
}}
|
|
430
|
+
'''
|
|
431
|
+
|
|
111
432
|
README_TEMPLATE = '''# {title}
|
|
112
433
|
|
|
113
|
-
A minimal HUD environment
|
|
434
|
+
A minimal HUD environment demonstrating the Task pattern with a simple counter.
|
|
114
435
|
|
|
115
436
|
## Quick Start
|
|
116
437
|
|
|
438
|
+
### Interactive Development
|
|
117
439
|
```bash
|
|
118
|
-
#
|
|
119
|
-
hud dev
|
|
440
|
+
# 1. Start the environment (optional: with inspector)
|
|
441
|
+
hud dev --build --inspector
|
|
442
|
+
|
|
443
|
+
# 2. Choose your preferred way to test:
|
|
444
|
+
|
|
445
|
+
# Option A: Interactive notebook test_env.ipynb (great for learning!)
|
|
120
446
|
|
|
121
|
-
#
|
|
122
|
-
|
|
123
|
-
hud dev --image {name}:dev
|
|
447
|
+
# Option B: Simple Python script (runs all tasks from tasks.json)
|
|
448
|
+
python test_task.py
|
|
124
449
|
```
|
|
125
450
|
|
|
126
|
-
|
|
451
|
+
### Run with an Agent
|
|
452
|
+
```bash
|
|
453
|
+
# Run the task with Claude
|
|
454
|
+
hud eval tasks.json --agent claude
|
|
455
|
+
```
|
|
127
456
|
|
|
128
|
-
|
|
129
|
-
- `src/hud_controller/context.py` - Persistent state across hot-reloads
|
|
130
|
-
- `Dockerfile` - Container configuration
|
|
131
|
-
- `pyproject.toml` - Python dependencies
|
|
457
|
+
## How HUD Environments Work
|
|
132
458
|
|
|
133
|
-
|
|
459
|
+
The environment is split into two components:
|
|
134
460
|
|
|
135
|
-
|
|
461
|
+
- **`env.py`** - Stateful logic that persists across reloads
|
|
462
|
+
- **`server.py`** - MCP server with tools (reloads on file changes)
|
|
136
463
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
464
|
+
This separation is crucial for `hud dev` - it allows you to modify the MCP tools and see changes immediately without losing the environment state. The environment runs as a separate process and communicates via socket, while the server can be restarted freely.
|
|
465
|
+
|
|
466
|
+
If you run into issues with the environment itself, running `hud dev --full-reload` will reload both the environment and the server.
|
|
467
|
+
|
|
468
|
+
## Publishing Your Environment
|
|
469
|
+
|
|
470
|
+
Once your environment is ready, you can share it with the community:
|
|
471
|
+
|
|
472
|
+
### 1. Push to Registry
|
|
473
|
+
```bash
|
|
474
|
+
# Build and push your environment (this requires docker hub login and hud api key)
|
|
475
|
+
hud build
|
|
476
|
+
hud push
|
|
142
477
|
```
|
|
143
478
|
|
|
144
|
-
|
|
479
|
+
### 2. Create a Dataset
|
|
145
480
|
|
|
146
|
-
|
|
481
|
+
Create a dataset on HuggingFace with your tasks:
|
|
147
482
|
|
|
483
|
+
**Option A: Upload manually**
|
|
484
|
+
1. Upload your `tasks.json` to HuggingFace
|
|
485
|
+
2. Make sure it's **public** to appear on leaderboards
|
|
486
|
+
|
|
487
|
+
**Option B: Use the SDK**
|
|
148
488
|
```python
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
489
|
+
from hud.datasets import save_tasks
|
|
490
|
+
import json
|
|
491
|
+
|
|
492
|
+
# Load your tasks
|
|
493
|
+
with open("tasks.json") as f:
|
|
494
|
+
tasks = json.load(f)
|
|
495
|
+
|
|
496
|
+
# Push to HuggingFace
|
|
497
|
+
save_tasks(tasks, repo_id="your-org/your-dataset")
|
|
153
498
|
```
|
|
154
499
|
|
|
155
|
-
|
|
500
|
+
### 3. Run and Track Performance
|
|
156
501
|
|
|
157
|
-
|
|
158
|
-
|
|
502
|
+
```bash
|
|
503
|
+
# Run Claude on your benchmark
|
|
504
|
+
hud eval "your-org/your-dataset" --agent claude
|
|
505
|
+
|
|
506
|
+
# View results at:
|
|
507
|
+
# app.hud.so/leaderboards/your-org/your-dataset
|
|
508
|
+
```
|
|
509
|
+
|
|
510
|
+
**Note**: Only public HuggingFace datasets appear as leaderboards!
|
|
511
|
+
|
|
512
|
+
📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
|
|
159
513
|
'''
|
|
160
514
|
|
|
161
515
|
|
|
@@ -202,7 +556,7 @@ def create_environment(name: str | None, directory: str, force: bool) -> None:
|
|
|
202
556
|
design.warning(f"Overwriting existing files in {target_dir}")
|
|
203
557
|
|
|
204
558
|
# Create directory structure
|
|
205
|
-
src_dir = target_dir / "src" / "
|
|
559
|
+
src_dir = target_dir / "src" / "controller"
|
|
206
560
|
src_dir.mkdir(parents=True, exist_ok=True)
|
|
207
561
|
|
|
208
562
|
# Write files with proper formatting
|
|
@@ -210,37 +564,54 @@ def create_environment(name: str | None, directory: str, force: bool) -> None:
|
|
|
210
564
|
|
|
211
565
|
# Dockerfile
|
|
212
566
|
dockerfile_path = target_dir / "Dockerfile"
|
|
213
|
-
dockerfile_path.write_text(DOCKERFILE_TEMPLATE.strip() + "\n")
|
|
567
|
+
dockerfile_path.write_text(DOCKERFILE_TEMPLATE.strip() + "\n", encoding="utf-8")
|
|
214
568
|
files_created.append("Dockerfile")
|
|
215
569
|
|
|
216
570
|
# pyproject.toml
|
|
217
571
|
pyproject_path = target_dir / "pyproject.toml"
|
|
218
572
|
pyproject_content = PYPROJECT_TEMPLATE.format(name=package_name).strip() + "\n"
|
|
219
|
-
pyproject_path.write_text(pyproject_content)
|
|
573
|
+
pyproject_path.write_text(pyproject_content, encoding="utf-8")
|
|
220
574
|
files_created.append("pyproject.toml")
|
|
221
575
|
|
|
222
576
|
# README.md
|
|
223
577
|
readme_path = target_dir / "README.md"
|
|
224
578
|
readme_content = README_TEMPLATE.format(name=package_name, title=name).strip() + "\n"
|
|
225
|
-
readme_path.write_text(readme_content)
|
|
579
|
+
readme_path.write_text(readme_content, encoding="utf-8")
|
|
226
580
|
files_created.append("README.md")
|
|
227
581
|
|
|
228
582
|
# Python files
|
|
229
583
|
# __init__.py
|
|
230
584
|
init_path = src_dir / "__init__.py"
|
|
231
|
-
init_path.write_text('"""
|
|
232
|
-
files_created.append("src/
|
|
585
|
+
init_path.write_text('"""Controller Package"""\n', encoding="utf-8")
|
|
586
|
+
files_created.append("src/controller/__init__.py")
|
|
233
587
|
|
|
234
|
-
#
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
files_created.append("src/
|
|
588
|
+
# env.py
|
|
589
|
+
env_path = src_dir / "env.py"
|
|
590
|
+
env_path.write_text(ENV_TEMPLATE.strip() + "\n", encoding="utf-8")
|
|
591
|
+
files_created.append("src/controller/env.py")
|
|
238
592
|
|
|
239
593
|
# server.py (need to escape the double braces for .format())
|
|
240
594
|
server_path = src_dir / "server.py"
|
|
241
595
|
server_content = SERVER_TEMPLATE.format(name=package_name).strip() + "\n"
|
|
242
|
-
server_path.write_text(server_content)
|
|
243
|
-
files_created.append("src/
|
|
596
|
+
server_path.write_text(server_content, encoding="utf-8")
|
|
597
|
+
files_created.append("src/controller/server.py")
|
|
598
|
+
|
|
599
|
+
# tasks.json
|
|
600
|
+
tasks_path = target_dir / "tasks.json"
|
|
601
|
+
tasks_content = TASKS_JSON_TEMPLATE.format(name=package_name).strip() + "\n"
|
|
602
|
+
tasks_path.write_text(tasks_content, encoding="utf-8")
|
|
603
|
+
files_created.append("tasks.json")
|
|
604
|
+
|
|
605
|
+
# test_task.py
|
|
606
|
+
test_task_path = target_dir / "test_task.py"
|
|
607
|
+
test_task_path.write_text(TEST_TASK_TEMPLATE.strip() + "\n", encoding="utf-8")
|
|
608
|
+
files_created.append("test_task.py")
|
|
609
|
+
|
|
610
|
+
# notebook.ipynb
|
|
611
|
+
notebook_path = target_dir / "test_env.ipynb"
|
|
612
|
+
notebook_content = NOTEBOOK_TEMPLATE.format(name=package_name).strip() + "\n"
|
|
613
|
+
notebook_path.write_text(notebook_content, encoding="utf-8")
|
|
614
|
+
files_created.append("test_env.ipynb")
|
|
244
615
|
|
|
245
616
|
# Success message
|
|
246
617
|
design.header(f"Created HUD Environment: {name}")
|
|
@@ -264,16 +635,24 @@ def create_environment(name: str | None, directory: str, force: bool) -> None:
|
|
|
264
635
|
design.info("\n3. Connect from Cursor or test via the MCP inspector:")
|
|
265
636
|
design.info(" Follow the instructions shown by hud dev --inspector")
|
|
266
637
|
|
|
267
|
-
design.info("\n4.
|
|
268
|
-
design.
|
|
269
|
-
|
|
638
|
+
design.info("\n4. Test your environment:")
|
|
639
|
+
design.command_example("python test_task.py")
|
|
640
|
+
|
|
641
|
+
design.info("\n5. Customize your environment:")
|
|
642
|
+
design.info(" - Add tools to src/controller/server.py")
|
|
643
|
+
design.info(" - Add state to src/controller/env.py")
|
|
644
|
+
design.info(" - Modify tasks in tasks.json")
|
|
645
|
+
design.info(" - Experiment in test_env.ipynb")
|
|
270
646
|
|
|
271
647
|
# Show a sample of the server code
|
|
272
648
|
design.section_title("Your MCP server")
|
|
273
649
|
sample_code = '''@mcp.tool()
|
|
274
650
|
async def act() -> str:
|
|
275
|
-
"""Perform an action."""
|
|
276
|
-
|
|
651
|
+
"""Perform an action that changes the environment state."""
|
|
652
|
+
if env is None:
|
|
653
|
+
raise RuntimeError("Context not initialized")
|
|
654
|
+
count = env.act()
|
|
655
|
+
return f"Action #{count} performed. Current count: {count}"'''
|
|
277
656
|
|
|
278
657
|
syntax = Syntax(sample_code, "python", theme="monokai", line_numbers=False)
|
|
279
658
|
design.console.print(Panel(syntax, border_style="dim"))
|
hud/cli/list_func.py
CHANGED
hud/clients/fastmcp.py
CHANGED
|
@@ -106,19 +106,9 @@ class FastMCPHUDClient(BaseHUDClient):
|
|
|
106
106
|
|
|
107
107
|
# Configure validation for output schemas based on client setting
|
|
108
108
|
try:
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
)
|
|
112
|
-
|
|
113
|
-
if (
|
|
114
|
-
hasattr(self._client, "_session_state")
|
|
115
|
-
and self._client._session_state.session is not None
|
|
116
|
-
):
|
|
117
|
-
self._client._session_state.session._validation_options = ValidationOptions( # type: ignore[attr-defined]
|
|
118
|
-
strict_output_validation=self._strict_validation
|
|
119
|
-
)
|
|
109
|
+
if hasattr(self._client, "_session_state") and self._client._session_state.session is not None: # noqa: E501
|
|
110
|
+
self._client._session_state.session._validate_structured_outputs = self._strict_validation # noqa: E501
|
|
120
111
|
except ImportError:
|
|
121
|
-
# ValidationOptions may not be available in some mcp versions
|
|
122
112
|
pass
|
|
123
113
|
|
|
124
114
|
logger.info("FastMCP client connected")
|
hud/clients/mcp_use.py
CHANGED
|
@@ -73,19 +73,13 @@ class MCPUseHUDClient(BaseHUDClient):
|
|
|
73
73
|
|
|
74
74
|
# Configure validation for all sessions based on client setting
|
|
75
75
|
try:
|
|
76
|
-
from hud_mcp.client.session import ( # type: ignore[import-not-found]
|
|
77
|
-
ValidationOptions, # type: ignore[import-not-found]
|
|
78
|
-
)
|
|
79
|
-
|
|
80
76
|
for session in self._sessions.values():
|
|
81
77
|
if (
|
|
82
78
|
hasattr(session, "connector")
|
|
83
79
|
and hasattr(session.connector, "client_session")
|
|
84
80
|
and session.connector.client_session is not None
|
|
85
81
|
):
|
|
86
|
-
session.connector.client_session.
|
|
87
|
-
strict_output_validation=self._strict_validation
|
|
88
|
-
)
|
|
82
|
+
session.connector.client_session._validate_structured_outputs = self._strict_validation # noqa: E501
|
|
89
83
|
except ImportError:
|
|
90
84
|
# ValidationOptions may not be available in some mcp versions
|
|
91
85
|
pass
|
hud/otel/instrumentation.py
CHANGED
|
@@ -30,11 +30,15 @@ def install_mcp_instrumentation(provider: TracerProvider) -> None:
|
|
|
30
30
|
logger = logging.getLogger(__name__)
|
|
31
31
|
|
|
32
32
|
try:
|
|
33
|
+
# First, patch the _instruments to use our fork
|
|
34
|
+
import opentelemetry.instrumentation.mcp.instrumentation as mcp_inst
|
|
35
|
+
mcp_inst._instruments = ("hud-mcp-python-sdk >= 3.13.1",)
|
|
36
|
+
|
|
33
37
|
from opentelemetry.instrumentation.mcp.instrumentation import (
|
|
34
38
|
McpInstrumentor,
|
|
35
39
|
)
|
|
36
40
|
|
|
37
|
-
#
|
|
41
|
+
# Then, patch the instrumentation to handle 3-value transports correctly
|
|
38
42
|
_patch_mcp_instrumentation()
|
|
39
43
|
|
|
40
44
|
McpInstrumentor().instrument(tracer_provider=provider)
|
hud/server/server.py
CHANGED
|
@@ -116,7 +116,7 @@ class MCPServer(FastMCP):
|
|
|
116
116
|
|
|
117
117
|
# Replace FastMCP's low-level server with our version that supports
|
|
118
118
|
# per-server initialization hooks
|
|
119
|
-
def _run_init(ctx: RequestContext) -> Any:
|
|
119
|
+
def _run_init(ctx: RequestContext | None = None) -> Any:
|
|
120
120
|
if self._initializer_fn is not None and not self._did_init:
|
|
121
121
|
self._did_init = True
|
|
122
122
|
# Redirect stdout to stderr during initialization to prevent
|
hud/utils/tests/test_version.py
CHANGED
hud/version.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.14
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -35,10 +35,9 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
35
35
|
Classifier: Programming Language :: Python :: 3.12
|
|
36
36
|
Classifier: Programming Language :: Python :: 3.13
|
|
37
37
|
Requires-Python: <3.14,>=3.11
|
|
38
|
-
Requires-Dist: fastmcp>=2.11.2
|
|
39
38
|
Requires-Dist: httpx<1,>=0.23.0
|
|
40
|
-
Requires-Dist: hud-
|
|
41
|
-
Requires-Dist: mcp>=
|
|
39
|
+
Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
|
|
40
|
+
Requires-Dist: hud-mcp-python-sdk>=3.13.2
|
|
42
41
|
Requires-Dist: opentelemetry-api>=1.34.1
|
|
43
42
|
Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
|
|
44
43
|
Requires-Dist: opentelemetry-instrumentation-mcp>=0.44.1
|
|
@@ -56,6 +55,7 @@ Provides-Extra: agent
|
|
|
56
55
|
Requires-Dist: anthropic; extra == 'agent'
|
|
57
56
|
Requires-Dist: datasets>=2.14.0; extra == 'agent'
|
|
58
57
|
Requires-Dist: dotenv>=0.9.9; extra == 'agent'
|
|
58
|
+
Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agent'
|
|
59
59
|
Requires-Dist: ipykernel; extra == 'agent'
|
|
60
60
|
Requires-Dist: ipython<9; extra == 'agent'
|
|
61
61
|
Requires-Dist: jupyter-client; extra == 'agent'
|
|
@@ -63,13 +63,13 @@ Requires-Dist: jupyter-core; extra == 'agent'
|
|
|
63
63
|
Requires-Dist: langchain; extra == 'agent'
|
|
64
64
|
Requires-Dist: langchain-anthropic; extra == 'agent'
|
|
65
65
|
Requires-Dist: langchain-openai; extra == 'agent'
|
|
66
|
-
Requires-Dist: mcp-use; extra == 'agent'
|
|
67
66
|
Requires-Dist: numpy>=1.24.0; extra == 'agent'
|
|
68
67
|
Requires-Dist: openai; extra == 'agent'
|
|
69
68
|
Provides-Extra: agents
|
|
70
69
|
Requires-Dist: anthropic; extra == 'agents'
|
|
71
70
|
Requires-Dist: datasets>=2.14.0; extra == 'agents'
|
|
72
71
|
Requires-Dist: dotenv>=0.9.9; extra == 'agents'
|
|
72
|
+
Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agents'
|
|
73
73
|
Requires-Dist: ipykernel; extra == 'agents'
|
|
74
74
|
Requires-Dist: ipython<9; extra == 'agents'
|
|
75
75
|
Requires-Dist: jupyter-client; extra == 'agents'
|
|
@@ -77,7 +77,6 @@ Requires-Dist: jupyter-core; extra == 'agents'
|
|
|
77
77
|
Requires-Dist: langchain; extra == 'agents'
|
|
78
78
|
Requires-Dist: langchain-anthropic; extra == 'agents'
|
|
79
79
|
Requires-Dist: langchain-openai; extra == 'agents'
|
|
80
|
-
Requires-Dist: mcp-use; extra == 'agents'
|
|
81
80
|
Requires-Dist: numpy>=1.24.0; extra == 'agents'
|
|
82
81
|
Requires-Dist: openai; extra == 'agents'
|
|
83
82
|
Provides-Extra: dev
|
|
@@ -85,6 +84,7 @@ Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
|
|
|
85
84
|
Requires-Dist: anthropic; extra == 'dev'
|
|
86
85
|
Requires-Dist: datasets>=2.14.0; extra == 'dev'
|
|
87
86
|
Requires-Dist: dotenv>=0.9.9; extra == 'dev'
|
|
87
|
+
Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'dev'
|
|
88
88
|
Requires-Dist: inspect-ai>=0.3.80; extra == 'dev'
|
|
89
89
|
Requires-Dist: ipykernel; extra == 'dev'
|
|
90
90
|
Requires-Dist: ipython<9; extra == 'dev'
|
|
@@ -93,7 +93,6 @@ Requires-Dist: jupyter-core; extra == 'dev'
|
|
|
93
93
|
Requires-Dist: langchain; extra == 'dev'
|
|
94
94
|
Requires-Dist: langchain-anthropic; extra == 'dev'
|
|
95
95
|
Requires-Dist: langchain-openai; extra == 'dev'
|
|
96
|
-
Requires-Dist: mcp-use; extra == 'dev'
|
|
97
96
|
Requires-Dist: numpy>=1.24.0; extra == 'dev'
|
|
98
97
|
Requires-Dist: openai; extra == 'dev'
|
|
99
98
|
Requires-Dist: pillow>=11.1.0; extra == 'dev'
|
|
@@ -3,7 +3,7 @@ hud/__main__.py,sha256=YR8Dq8OhINOsVfQ55PmRXXg4fEK84Rt_-rMtJ5rvhWo,145
|
|
|
3
3
|
hud/datasets.py,sha256=8lqC840kcNx01D2CcWZCd1j0eZTpepILmQrvohZIZYU,12056
|
|
4
4
|
hud/settings.py,sha256=WIJDsyrfwBZGcaGT46YUOpW8xjBZl3siXXprd92ASAg,2039
|
|
5
5
|
hud/types.py,sha256=pQWOPYXUZ2hhK0h-AHBc3DCj5tkbRXHqKZnsQQIcSFA,4237
|
|
6
|
-
hud/version.py,sha256=
|
|
6
|
+
hud/version.py,sha256=EujFSzlsB3e5WmhxNLuJ-8DYtTfWdY6iOL9lPpx0r5U,105
|
|
7
7
|
hud/agents/__init__.py,sha256=UoIkljWdbq4bM0LD-mSaw6w826EqdEjOk7r6glNYwYQ,286
|
|
8
8
|
hud/agents/base.py,sha256=M2g7Cj5InE4EsXpmxqURprC3IHNGvNZFBZ8HPIQxz-A,24574
|
|
9
9
|
hud/agents/claude.py,sha256=snbYFPW-KAkw4n9Rdz7dC2f46RuSHJKC53HPm8SucFM,14273
|
|
@@ -17,16 +17,16 @@ hud/agents/tests/test_base.py,sha256=F39ajSqASGUbPyPoWSY9KARFav62qNTK74W11Tr1Tg4
|
|
|
17
17
|
hud/agents/tests/test_claude.py,sha256=wqEKlzEvx8obz1sSm4NY0j-Zyt1qWNfDOmRqYIuAEd0,13069
|
|
18
18
|
hud/agents/tests/test_client.py,sha256=Sk5bGZw2hL5GsVi2LMp9tsLngl5ZQ18pkpeeQmts0ao,13908
|
|
19
19
|
hud/agents/tests/test_openai.py,sha256=ZJqctxCbJtKw6TkJCP4D2xAcG8CkxzDXO7dh5IIWN_M,9175
|
|
20
|
-
hud/cli/__init__.py,sha256=
|
|
20
|
+
hud/cli/__init__.py,sha256=ecjrYlswQB9JJsbxQFcKZVD0fn0ZWneKSLm_kBYWpQ0,30302
|
|
21
21
|
hud/cli/__main__.py,sha256=fDH7XITyuDITwSDIVwRso06aouADO0CzTHKqp5TOwJE,143
|
|
22
22
|
hud/cli/analyze.py,sha256=G-tjT1xLPLcYhDhZEaI7TAIS0z0OACUksnGFoAWd2ag,14416
|
|
23
23
|
hud/cli/build.py,sha256=c8pg8iUlCT1-E4koEKFX1Nx8oGaB2ln57pHdOCCDAvs,19126
|
|
24
24
|
hud/cli/clone.py,sha256=AwVDIuhr8mHb1oT2Af2HrD25SiTdwATpE6zd93vzLgA,6099
|
|
25
25
|
hud/cli/debug.py,sha256=FNzg9-_ZzUJA1nJfubmop7_2OT5mqnWsdpZyi4AVSXA,14163
|
|
26
|
-
hud/cli/dev.py,sha256=
|
|
27
|
-
hud/cli/eval.py,sha256=
|
|
28
|
-
hud/cli/init.py,sha256=
|
|
29
|
-
hud/cli/list_func.py,sha256=
|
|
26
|
+
hud/cli/dev.py,sha256=ANsd34gHX08eQxeXz6atIuDyi7Tw8qngqvmDPAx-PI0,28640
|
|
27
|
+
hud/cli/eval.py,sha256=zrUoXYdSe5cVbWa5fc9-tNK9syBCtKOpKDvc0ApeYQU,12604
|
|
28
|
+
hud/cli/init.py,sha256=guJbNkVuFhc-c2jTEx_jZxzzPkJRtGTJapWk5hyuyd8,18710
|
|
29
|
+
hud/cli/list_func.py,sha256=ENxLL4X5uuqAASWZdQuI0k-tEzmlhUn5LATgz3QPQqQ,7065
|
|
30
30
|
hud/cli/pull.py,sha256=JHwCwUwRO0Nzbgm9mkjsz6EpxbxgwQVhgNSY64nNZ-s,11969
|
|
31
31
|
hud/cli/push.py,sha256=4KrEHj0_i3xJNCB3eRjANmHFhSW4MFfpnld3nfVYENs,17904
|
|
32
32
|
hud/cli/remove.py,sha256=USAvB6pbMA3jd19xUtLEBiMsklVTEfE2Maw9nYcpSAE,6619
|
|
@@ -60,8 +60,8 @@ hud/cli/utils/server.py,sha256=uSx2DjG5vX-PFoD8zNH-gBHbkTNSHveFSVdAfmp09Tc,7341
|
|
|
60
60
|
hud/clients/README.md,sha256=XNE3mch95ozDgVqfwCGcrhlHY9CwT1GKfNANNboowto,3826
|
|
61
61
|
hud/clients/__init__.py,sha256=bcPIa7dwH5ENsjh7CzjsJ84fm7Ma93NBc2lGfSjGAKM,328
|
|
62
62
|
hud/clients/base.py,sha256=ob8G7_Gi-aENnc0yxHpZmzuqBD-swn_jVWkY2Iw7F4k,13995
|
|
63
|
-
hud/clients/fastmcp.py,sha256=
|
|
64
|
-
hud/clients/mcp_use.py,sha256=
|
|
63
|
+
hud/clients/fastmcp.py,sha256=b1Q5HltWWmnAhj-Nv6T4T5gitDn5bEfqiLy5PU5yD9g,9102
|
|
64
|
+
hud/clients/mcp_use.py,sha256=qRHDJ6ELRISD4V9NVPAX5SNE3NZqyunPAqDdpBtaslg,11920
|
|
65
65
|
hud/clients/tests/__init__.py,sha256=sKOtJFFa4mDIXh1U6O8ZUHjigE8CiRMQ2PzJTIBZuVE,33
|
|
66
66
|
hud/clients/tests/test_client_integration.py,sha256=kohU6jfCNfwSnAushHeB1_CmDlRfQc7VBL0GEdJYSeI,4198
|
|
67
67
|
hud/clients/tests/test_fastmcp.py,sha256=4q3TzDjuieTZa89taiNJIrzbUncNkYOG4MaubypA21k,13030
|
|
@@ -75,14 +75,14 @@ hud/otel/collector.py,sha256=jLZymZ8r7xt2VDuWexfbnT7PY1-0aiyLMgjBy8KDY1M,4497
|
|
|
75
75
|
hud/otel/config.py,sha256=6np_C2UXhtKHHjY41HQxZElua2Eh_EUCBiRB_YuiSuc,6249
|
|
76
76
|
hud/otel/context.py,sha256=C9MvO99cRSNNDEDC7ehO3eoTPnb6J7AemUYvEp57yEU,17774
|
|
77
77
|
hud/otel/exporters.py,sha256=TP7SF6ySCP-gFV1i-u5-HbpYsK3n9GP3OjW_ZBfsj-w,14246
|
|
78
|
-
hud/otel/instrumentation.py,sha256=
|
|
78
|
+
hud/otel/instrumentation.py,sha256=xbRRmTDwDyCvJVm4iWmB65kXOhotTnv9GjwkufARBuk,3782
|
|
79
79
|
hud/otel/processors.py,sha256=yI5BWsDBMEPfwMzD-iWbJd4KWH3qUDSe-5-C1yT6fjU,4615
|
|
80
80
|
hud/otel/tests/__init__.py,sha256=VNJKBMaxTtbn7trW-1Ph50zCvCok_wTSGcI1HD6GOLA,43
|
|
81
81
|
hud/otel/tests/test_processors.py,sha256=np0R4ssd9j6LJSJykJ5bNjl0POwNYNhgb7BqOZHwcMY,6778
|
|
82
82
|
hud/server/__init__.py,sha256=8LUwgsXO8xiViWP7uImDwcOsWLu01r5F4r8U8qH3rSY,91
|
|
83
83
|
hud/server/context.py,sha256=6bCdSzv1FGyItu9472HbbYef279H7QuMGJDR8EtYg5Y,3210
|
|
84
84
|
hud/server/low_level.py,sha256=XYs2pOJ9kN4OcJ6ahDmXM5mWkzq5wJLpKFInUYrWEok,4701
|
|
85
|
-
hud/server/server.py,sha256=
|
|
85
|
+
hud/server/server.py,sha256=jx2JEGeVkV5wDVKM7Sb474uY4fd-c6azo7HS_SFYDxo,8013
|
|
86
86
|
hud/server/helper/__init__.py,sha256=ZxO8VP3RZEBBp-q65VixuhzQgqEPSVzW0hEY9J9QqDA,116
|
|
87
87
|
hud/server/tests/__init__.py,sha256=eEYYkxX5Hz9woXVOBJ2H2_CQoEih0vH6nRt3sH2Z8v8,49
|
|
88
88
|
hud/shared/__init__.py,sha256=IPxPCqtPLguryN-nBq78Sakypw2bRiE2iHv3SXG8YRk,139
|
|
@@ -144,10 +144,10 @@ hud/utils/tests/test_init.py,sha256=2QLQSGgyP9wJhOvPCusm_zjJad0qApOZi1BXpxcdHXQ,
|
|
|
144
144
|
hud/utils/tests/test_mcp.py,sha256=0pUa16mL-bqbZDXp5NHBnt1gO5o10BOg7zTMHZ1DNPM,4023
|
|
145
145
|
hud/utils/tests/test_progress.py,sha256=QSF7Kpi03Ff_l3mAeqW9qs1nhK50j9vBiSobZq7T4f4,7394
|
|
146
146
|
hud/utils/tests/test_telemetry.py,sha256=5jl7bEx8C8b-FfFUko5pf4UY-mPOR-9HaeL98dGtVHM,2781
|
|
147
|
-
hud/utils/tests/test_version.py,sha256=
|
|
147
|
+
hud/utils/tests/test_version.py,sha256=JXMZuhuGL6fqB8mARikOgFFMpmq1Y0rG-7kz7V43w5k,160
|
|
148
148
|
hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
149
|
-
hud_python-0.4.
|
|
150
|
-
hud_python-0.4.
|
|
151
|
-
hud_python-0.4.
|
|
152
|
-
hud_python-0.4.
|
|
153
|
-
hud_python-0.4.
|
|
149
|
+
hud_python-0.4.14.dist-info/METADATA,sha256=e7OCOwaSi0F_gPdM0CUg2buSpTx0wd9w4uny00NH2xM,20233
|
|
150
|
+
hud_python-0.4.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
151
|
+
hud_python-0.4.14.dist-info/entry_points.txt,sha256=jJbodNFg1m0-CDofe5AHvB4zKBq7sSdP97-ohaQ3ae4,63
|
|
152
|
+
hud_python-0.4.14.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
|
|
153
|
+
hud_python-0.4.14.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|