hud-python 0.4.53__py3-none-any.whl → 0.4.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/agents/base.py +8 -0
- hud/agents/claude.py +4 -3
- hud/agents/openai.py +2 -1
- hud/agents/openai_chat_generic.py +3 -2
- hud/agents/tests/test_claude.py +2 -2
- hud/agents/tests/test_openai.py +1 -1
- hud/agents/utils.py +50 -0
- hud/cli/__init__.py +65 -9
- hud/cli/build.py +185 -25
- hud/cli/dev.py +130 -40
- hud/cli/eval.py +123 -24
- hud/cli/flows/dev.py +155 -0
- hud/cli/flows/tasks.py +29 -9
- hud/cli/tests/test_eval.py +6 -6
- hud/cli/utils/docker.py +6 -3
- hud/clients/base.py +2 -2
- hud/otel/context.py +42 -1
- hud/server/server.py +29 -3
- hud/settings.py +6 -0
- hud/telemetry/async_context.py +16 -2
- hud/telemetry/trace.py +6 -1
- hud/types.py +10 -0
- hud/utils/group_eval.py +14 -2
- hud/utils/tests/test_agent_factories.py +2 -1
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.53.dist-info → hud_python-0.4.55.dist-info}/METADATA +8 -7
- {hud_python-0.4.53.dist-info → hud_python-0.4.55.dist-info}/RECORD +31 -29
- {hud_python-0.4.53.dist-info → hud_python-0.4.55.dist-info}/WHEEL +0 -0
- {hud_python-0.4.53.dist-info → hud_python-0.4.55.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.53.dist-info → hud_python-0.4.55.dist-info}/licenses/LICENSE +0 -0
hud/cli/dev.py
CHANGED
|
@@ -25,6 +25,7 @@ def show_dev_server_info(
|
|
|
25
25
|
inspector: bool,
|
|
26
26
|
interactive: bool,
|
|
27
27
|
env_dir: Path | None = None,
|
|
28
|
+
new: bool = False,
|
|
28
29
|
) -> str:
|
|
29
30
|
"""Show consistent server info for both Python and Docker modes.
|
|
30
31
|
|
|
@@ -125,6 +126,7 @@ async def run_mcp_module(
|
|
|
125
126
|
verbose: bool,
|
|
126
127
|
inspector: bool,
|
|
127
128
|
interactive: bool,
|
|
129
|
+
new: bool = False,
|
|
128
130
|
) -> None:
|
|
129
131
|
"""Run an MCP module directly."""
|
|
130
132
|
# Check if this is a reload (not first run)
|
|
@@ -222,14 +224,53 @@ async def run_mcp_module(
|
|
|
222
224
|
|
|
223
225
|
# Show server info only on first run
|
|
224
226
|
if not is_reload:
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
227
|
+
# Try dynamic trace first for HTTP mode (only if --new)
|
|
228
|
+
live_trace_url: str | None = None
|
|
229
|
+
if transport == "http" and new:
|
|
230
|
+
try:
|
|
231
|
+
local_mcp_config: dict[str, dict[str, Any]] = {
|
|
232
|
+
"hud": {
|
|
233
|
+
"url": f"http://localhost:{port}/mcp",
|
|
234
|
+
"headers": {},
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
from hud.cli.flows.dev import create_dynamic_trace
|
|
239
|
+
|
|
240
|
+
live_trace_url = await create_dynamic_trace(
|
|
241
|
+
mcp_config=local_mcp_config,
|
|
242
|
+
build_status=False,
|
|
243
|
+
environment_name=mcp_server.name or "mcp-server",
|
|
244
|
+
)
|
|
245
|
+
except Exception: # noqa: S110
|
|
246
|
+
pass
|
|
247
|
+
|
|
248
|
+
# Show UI using shared flow logic
|
|
249
|
+
if transport == "http" and live_trace_url and new:
|
|
250
|
+
# Minimal UI with live trace
|
|
251
|
+
from hud.cli.flows.dev import generate_cursor_deeplink, show_dev_ui
|
|
252
|
+
|
|
253
|
+
server_name = mcp_server.name or "mcp-server"
|
|
254
|
+
cursor_deeplink = generate_cursor_deeplink(server_name, port)
|
|
255
|
+
|
|
256
|
+
show_dev_ui(
|
|
257
|
+
live_trace_url=live_trace_url,
|
|
258
|
+
server_name=server_name,
|
|
259
|
+
port=port,
|
|
260
|
+
cursor_deeplink=cursor_deeplink,
|
|
261
|
+
is_docker=False,
|
|
262
|
+
)
|
|
263
|
+
else:
|
|
264
|
+
# Full UI for HTTP without trace, or stdio mode
|
|
265
|
+
show_dev_server_info(
|
|
266
|
+
server_name=mcp_server.name or "mcp-server",
|
|
267
|
+
port=port,
|
|
268
|
+
transport=transport,
|
|
269
|
+
inspector=inspector,
|
|
270
|
+
interactive=interactive,
|
|
271
|
+
env_dir=Path.cwd().parent if (Path.cwd().parent / "environment").exists() else None,
|
|
272
|
+
new=new,
|
|
273
|
+
)
|
|
233
274
|
|
|
234
275
|
# Check if there's an environment backend and remind user to start it (first run only)
|
|
235
276
|
if not is_reload:
|
|
@@ -238,9 +279,10 @@ async def run_mcp_module(
|
|
|
238
279
|
if env_dir.exists() and (env_dir / "server.py").exists():
|
|
239
280
|
hud_console.info("")
|
|
240
281
|
hud_console.info(
|
|
241
|
-
f"{hud_console.sym.FLOW} Don't forget to start the environment backend
|
|
282
|
+
f"{hud_console.sym.FLOW} Don't forget to start the environment backend in another "
|
|
283
|
+
"terminal:"
|
|
242
284
|
)
|
|
243
|
-
hud_console.info(" cd
|
|
285
|
+
hud_console.info(" cd environment && uv run python uvicorn server:app --reload")
|
|
244
286
|
|
|
245
287
|
# Launch inspector if requested (first run only)
|
|
246
288
|
if inspector and transport == "http":
|
|
@@ -347,6 +389,7 @@ def run_with_reload(
|
|
|
347
389
|
verbose: bool,
|
|
348
390
|
inspector: bool,
|
|
349
391
|
interactive: bool,
|
|
392
|
+
new: bool = False,
|
|
350
393
|
) -> None:
|
|
351
394
|
"""Run module with file watching and auto-reload."""
|
|
352
395
|
try:
|
|
@@ -389,6 +432,11 @@ def run_with_reload(
|
|
|
389
432
|
|
|
390
433
|
if verbose:
|
|
391
434
|
cmd.append("--verbose")
|
|
435
|
+
|
|
436
|
+
if new:
|
|
437
|
+
cmd.append("--new")
|
|
438
|
+
|
|
439
|
+
if verbose:
|
|
392
440
|
hud_console.info(f"Starting: {' '.join(cmd)}")
|
|
393
441
|
|
|
394
442
|
# Mark as reload after first run to suppress logs
|
|
@@ -454,7 +502,12 @@ def run_with_reload(
|
|
|
454
502
|
|
|
455
503
|
|
|
456
504
|
def run_docker_dev_server(
|
|
457
|
-
port: int,
|
|
505
|
+
port: int,
|
|
506
|
+
verbose: bool,
|
|
507
|
+
inspector: bool,
|
|
508
|
+
interactive: bool,
|
|
509
|
+
docker_args: list[str],
|
|
510
|
+
new: bool = False,
|
|
458
511
|
) -> None:
|
|
459
512
|
"""Run MCP server in Docker with volume mounts, expose via local HTTP proxy."""
|
|
460
513
|
import typer
|
|
@@ -462,6 +515,11 @@ def run_docker_dev_server(
|
|
|
462
515
|
|
|
463
516
|
from hud.server import MCPServer
|
|
464
517
|
|
|
518
|
+
# Ensure Docker CLI and daemon are available before proceeding
|
|
519
|
+
from .utils.docker import require_docker_running
|
|
520
|
+
|
|
521
|
+
require_docker_running()
|
|
522
|
+
|
|
465
523
|
cwd = Path.cwd()
|
|
466
524
|
|
|
467
525
|
# Find environment directory (current or parent with hud.lock.yaml)
|
|
@@ -528,15 +586,6 @@ def run_docker_dev_server(
|
|
|
528
586
|
env_dir=env_dir,
|
|
529
587
|
)
|
|
530
588
|
|
|
531
|
-
# Env flags already injected by create_docker_run_command
|
|
532
|
-
|
|
533
|
-
# Print startup info
|
|
534
|
-
hud_console.header("HUD Development Mode (Docker)")
|
|
535
|
-
|
|
536
|
-
if verbose:
|
|
537
|
-
hud_console.section_title("Docker Command")
|
|
538
|
-
hud_console.info(" ".join(docker_cmd))
|
|
539
|
-
|
|
540
589
|
# Create MCP config pointing to the Docker container's stdio
|
|
541
590
|
mcp_config = {
|
|
542
591
|
"docker": {
|
|
@@ -545,15 +594,62 @@ def run_docker_dev_server(
|
|
|
545
594
|
}
|
|
546
595
|
}
|
|
547
596
|
|
|
548
|
-
#
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
597
|
+
# Attempt to create dynamic trace early (before any UI)
|
|
598
|
+
import asyncio as _asy
|
|
599
|
+
|
|
600
|
+
from hud.cli.flows.dev import create_dynamic_trace, generate_cursor_deeplink, show_dev_ui
|
|
601
|
+
|
|
602
|
+
live_trace_url: str | None = None
|
|
603
|
+
if new:
|
|
604
|
+
try:
|
|
605
|
+
local_mcp_config: dict[str, dict[str, Any]] = {
|
|
606
|
+
"hud": {
|
|
607
|
+
"url": f"http://localhost:{port}/mcp",
|
|
608
|
+
"headers": {},
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
live_trace_url = _asy.run(
|
|
612
|
+
create_dynamic_trace(
|
|
613
|
+
mcp_config=local_mcp_config,
|
|
614
|
+
build_status=True,
|
|
615
|
+
environment_name=image_name,
|
|
616
|
+
)
|
|
617
|
+
)
|
|
618
|
+
except Exception: # noqa: S110
|
|
619
|
+
pass
|
|
620
|
+
|
|
621
|
+
# Show appropriate UI
|
|
622
|
+
if live_trace_url and new:
|
|
623
|
+
# Minimal UI with live trace
|
|
624
|
+
cursor_deeplink = generate_cursor_deeplink(image_name, port)
|
|
625
|
+
show_dev_ui(
|
|
626
|
+
live_trace_url=live_trace_url,
|
|
627
|
+
server_name=image_name,
|
|
628
|
+
port=port,
|
|
629
|
+
cursor_deeplink=cursor_deeplink,
|
|
630
|
+
is_docker=True,
|
|
631
|
+
)
|
|
632
|
+
else:
|
|
633
|
+
# Full UI
|
|
634
|
+
hud_console.header("HUD Development Mode (Docker)")
|
|
635
|
+
if verbose:
|
|
636
|
+
hud_console.section_title("Docker Command")
|
|
637
|
+
hud_console.info(" ".join(docker_cmd))
|
|
638
|
+
show_dev_server_info(
|
|
639
|
+
server_name=image_name,
|
|
640
|
+
port=port,
|
|
641
|
+
transport="http",
|
|
642
|
+
inspector=inspector,
|
|
643
|
+
interactive=interactive,
|
|
644
|
+
env_dir=env_dir,
|
|
645
|
+
new=new,
|
|
646
|
+
)
|
|
647
|
+
hud_console.dim_info(
|
|
648
|
+
"",
|
|
649
|
+
"Container restarts on file changes (mounted volumes), "
|
|
650
|
+
"if changing tools run hud dev again",
|
|
651
|
+
)
|
|
652
|
+
hud_console.info("")
|
|
557
653
|
|
|
558
654
|
# Suppress logs unless verbose
|
|
559
655
|
if not verbose:
|
|
@@ -562,13 +658,6 @@ def run_docker_dev_server(
|
|
|
562
658
|
logging.getLogger("uvicorn").setLevel(logging.ERROR)
|
|
563
659
|
os.environ["FASTMCP_DISABLE_BANNER"] = "1"
|
|
564
660
|
|
|
565
|
-
# Note about hot-reload behavior
|
|
566
|
-
hud_console.dim_info(
|
|
567
|
-
"",
|
|
568
|
-
"Container restarts on file changes (mounted volumes), if changing tools run hud dev again",
|
|
569
|
-
)
|
|
570
|
-
hud_console.info("")
|
|
571
|
-
|
|
572
661
|
# Create and run proxy with HUD helpers
|
|
573
662
|
async def run_proxy() -> None:
|
|
574
663
|
from fastmcp import FastMCP
|
|
@@ -617,6 +706,7 @@ def run_mcp_dev_server(
|
|
|
617
706
|
watch: list[str] | None,
|
|
618
707
|
docker: bool = False,
|
|
619
708
|
docker_args: list[str] | None = None,
|
|
709
|
+
new: bool = False,
|
|
620
710
|
) -> None:
|
|
621
711
|
"""Run MCP development server with hot-reload."""
|
|
622
712
|
docker_args = docker_args or []
|
|
@@ -627,12 +717,12 @@ def run_mcp_dev_server(
|
|
|
627
717
|
hud_console.note("Detected Dockerfile - using Docker mode with volume mounts")
|
|
628
718
|
hud_console.dim_info("Tip", "Use 'hud dev --help' to see all options")
|
|
629
719
|
hud_console.info("")
|
|
630
|
-
run_docker_dev_server(port, verbose, inspector, interactive, docker_args)
|
|
720
|
+
run_docker_dev_server(port, verbose, inspector, interactive, docker_args, new)
|
|
631
721
|
return
|
|
632
722
|
|
|
633
723
|
# Route to Docker mode if explicitly requested
|
|
634
724
|
if docker:
|
|
635
|
-
run_docker_dev_server(port, verbose, inspector, interactive, docker_args)
|
|
725
|
+
run_docker_dev_server(port, verbose, inspector, interactive, docker_args, new)
|
|
636
726
|
return
|
|
637
727
|
|
|
638
728
|
transport = "stdio" if stdio else "http"
|
|
@@ -676,6 +766,6 @@ def run_mcp_dev_server(
|
|
|
676
766
|
is_child = os.environ.get("_HUD_DEV_CHILD") == "1"
|
|
677
767
|
|
|
678
768
|
if is_child:
|
|
679
|
-
asyncio.run(run_mcp_module(module, transport, port, verbose, False, False))
|
|
769
|
+
asyncio.run(run_mcp_module(module, transport, port, verbose, False, False, new))
|
|
680
770
|
else:
|
|
681
|
-
run_with_reload(module, watch_paths, transport, port, verbose, inspector, interactive)
|
|
771
|
+
run_with_reload(module, watch_paths, transport, port, verbose, inspector, interactive, new)
|
hud/cli/eval.py
CHANGED
|
@@ -5,13 +5,14 @@ from __future__ import annotations
|
|
|
5
5
|
import asyncio
|
|
6
6
|
import logging
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import TYPE_CHECKING, Any
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
9
|
|
|
10
10
|
import typer
|
|
11
11
|
|
|
12
12
|
import hud
|
|
13
13
|
from hud.cli.utils.env_check import ensure_built, find_environment_dir
|
|
14
14
|
from hud.settings import settings
|
|
15
|
+
from hud.types import AgentType
|
|
15
16
|
from hud.utils.group_eval import display_group_statistics, run_tasks_grouped
|
|
16
17
|
from hud.utils.hud_console import HUDConsole
|
|
17
18
|
|
|
@@ -21,6 +22,28 @@ logger = logging.getLogger(__name__)
|
|
|
21
22
|
hud_console = HUDConsole()
|
|
22
23
|
|
|
23
24
|
|
|
25
|
+
def _tasks_use_local_mcp(tasks: list[Task]) -> bool:
|
|
26
|
+
"""Return True if any task's MCP config uses a local command instead of a URL.
|
|
27
|
+
|
|
28
|
+
A config is considered local when a server entry contains a 'command' key and
|
|
29
|
+
does not provide a 'url'.
|
|
30
|
+
"""
|
|
31
|
+
try:
|
|
32
|
+
for t in tasks:
|
|
33
|
+
cfg = getattr(t, "mcp_config", {}) or {}
|
|
34
|
+
if not isinstance(cfg, dict):
|
|
35
|
+
continue
|
|
36
|
+
for server_cfg in cfg.values():
|
|
37
|
+
if isinstance(server_cfg, dict) and (
|
|
38
|
+
"command" in server_cfg and not server_cfg.get("url")
|
|
39
|
+
):
|
|
40
|
+
return True
|
|
41
|
+
return False
|
|
42
|
+
except Exception:
|
|
43
|
+
# Be conservative: if detection fails, do not block
|
|
44
|
+
return False
|
|
45
|
+
|
|
46
|
+
|
|
24
47
|
def get_available_models() -> list[dict[str, str | None]]:
|
|
25
48
|
"""Fetch available models from the HUD API (only ready models).
|
|
26
49
|
|
|
@@ -113,7 +136,7 @@ def _build_vllm_config(
|
|
|
113
136
|
|
|
114
137
|
|
|
115
138
|
def build_agent(
|
|
116
|
-
agent_type:
|
|
139
|
+
agent_type: AgentType,
|
|
117
140
|
*,
|
|
118
141
|
model: str | None = None,
|
|
119
142
|
allowed_tools: list[str] | None = None,
|
|
@@ -123,11 +146,11 @@ def build_agent(
|
|
|
123
146
|
"""Create and return the requested agent type."""
|
|
124
147
|
|
|
125
148
|
# Import agents lazily to avoid dependency issues
|
|
126
|
-
if agent_type ==
|
|
149
|
+
if agent_type == AgentType.INTEGRATION_TEST:
|
|
127
150
|
from hud.agents.misc.integration_test_agent import IntegrationTestRunner
|
|
128
151
|
|
|
129
152
|
return IntegrationTestRunner(verbose=verbose)
|
|
130
|
-
elif agent_type ==
|
|
153
|
+
elif agent_type == AgentType.VLLM:
|
|
131
154
|
# Create a generic OpenAI agent for vLLM server
|
|
132
155
|
try:
|
|
133
156
|
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
|
|
@@ -147,7 +170,7 @@ def build_agent(
|
|
|
147
170
|
)
|
|
148
171
|
return GenericOpenAIChatAgent(**config)
|
|
149
172
|
|
|
150
|
-
elif agent_type ==
|
|
173
|
+
elif agent_type == AgentType.OPENAI:
|
|
151
174
|
try:
|
|
152
175
|
from hud.agents import OperatorAgent
|
|
153
176
|
except ImportError as e:
|
|
@@ -165,7 +188,7 @@ def build_agent(
|
|
|
165
188
|
else:
|
|
166
189
|
return OperatorAgent(verbose=verbose)
|
|
167
190
|
|
|
168
|
-
elif agent_type ==
|
|
191
|
+
elif agent_type == AgentType.LITELLM:
|
|
169
192
|
try:
|
|
170
193
|
from hud.agents.lite_llm import LiteAgent
|
|
171
194
|
except ImportError as e:
|
|
@@ -209,7 +232,7 @@ def build_agent(
|
|
|
209
232
|
async def run_single_task(
|
|
210
233
|
source: str,
|
|
211
234
|
*,
|
|
212
|
-
agent_type:
|
|
235
|
+
agent_type: AgentType = AgentType.CLAUDE,
|
|
213
236
|
model: str | None = None,
|
|
214
237
|
allowed_tools: list[str] | None = None,
|
|
215
238
|
max_steps: int = 10,
|
|
@@ -264,18 +287,44 @@ async def run_single_task(
|
|
|
264
287
|
"Using first task from dataset (run with --full to run the entire dataset)..."
|
|
265
288
|
)
|
|
266
289
|
|
|
267
|
-
|
|
290
|
+
# Warn/confirm if the task uses local MCP config
|
|
291
|
+
try:
|
|
292
|
+
if group_size > 1 and _tasks_use_local_mcp([task]):
|
|
293
|
+
hud_console.warning(
|
|
294
|
+
"Detected a local MCP configuration (uses 'command' instead of a 'url')."
|
|
295
|
+
)
|
|
296
|
+
hud_console.info(
|
|
297
|
+
"Ensure there are no exposed port conflicts during Docker runs/builds in eval."
|
|
298
|
+
)
|
|
299
|
+
proceed = hud_console.confirm(
|
|
300
|
+
"Proceed with running local MCP servers for this evaluation?",
|
|
301
|
+
default=True,
|
|
302
|
+
)
|
|
303
|
+
if not proceed:
|
|
304
|
+
# Provide a helpful next step
|
|
305
|
+
hud_console.hint("You can convert tasks to remote with: hud convert <tasks_file>")
|
|
306
|
+
raise typer.Exit(1)
|
|
307
|
+
# Always show the convert hint for awareness
|
|
308
|
+
hud_console.hint(
|
|
309
|
+
"Avoid local port conflicts by converting to remote: hud convert <tasks_file>"
|
|
310
|
+
)
|
|
311
|
+
except typer.Exit:
|
|
312
|
+
raise
|
|
313
|
+
except Exception as e:
|
|
314
|
+
hud_console.debug(f"Local MCP confirmation skipped due to error: {e}")
|
|
315
|
+
|
|
316
|
+
task_prompt = task.prompt
|
|
268
317
|
|
|
269
318
|
# Use grouped evaluation if group_size > 1
|
|
270
319
|
agent_config: dict[str, Any] = {}
|
|
271
|
-
if agent_type ==
|
|
320
|
+
if agent_type == AgentType.INTEGRATION_TEST:
|
|
272
321
|
from hud.agents.misc.integration_test_agent import IntegrationTestRunner
|
|
273
322
|
|
|
274
323
|
agent_class = IntegrationTestRunner
|
|
275
324
|
agent_config = {"verbose": verbose}
|
|
276
325
|
if allowed_tools:
|
|
277
326
|
agent_config["allowed_tools"] = allowed_tools
|
|
278
|
-
elif agent_type ==
|
|
327
|
+
elif agent_type == AgentType.VLLM:
|
|
279
328
|
# Special handling for vLLM
|
|
280
329
|
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
|
|
281
330
|
|
|
@@ -288,14 +337,14 @@ async def run_single_task(
|
|
|
288
337
|
allowed_tools=allowed_tools,
|
|
289
338
|
verbose=verbose,
|
|
290
339
|
)
|
|
291
|
-
elif agent_type ==
|
|
340
|
+
elif agent_type == AgentType.OPENAI:
|
|
292
341
|
from hud.agents import OperatorAgent
|
|
293
342
|
|
|
294
343
|
agent_class = OperatorAgent
|
|
295
344
|
agent_config = {"verbose": verbose}
|
|
296
345
|
if allowed_tools:
|
|
297
346
|
agent_config["allowed_tools"] = allowed_tools
|
|
298
|
-
elif agent_type ==
|
|
347
|
+
elif agent_type == AgentType.LITELLM:
|
|
299
348
|
from hud.agents.lite_llm import LiteAgent
|
|
300
349
|
|
|
301
350
|
agent_class = LiteAgent
|
|
@@ -305,7 +354,7 @@ async def run_single_task(
|
|
|
305
354
|
}
|
|
306
355
|
if allowed_tools:
|
|
307
356
|
agent_config["allowed_tools"] = allowed_tools
|
|
308
|
-
elif agent_type ==
|
|
357
|
+
elif agent_type == AgentType.CLAUDE:
|
|
309
358
|
from hud.agents import ClaudeAgent
|
|
310
359
|
|
|
311
360
|
agent_class = ClaudeAgent
|
|
@@ -353,7 +402,7 @@ async def run_single_task(
|
|
|
353
402
|
async def run_full_dataset(
|
|
354
403
|
source: str,
|
|
355
404
|
*,
|
|
356
|
-
agent_type:
|
|
405
|
+
agent_type: AgentType = AgentType.CLAUDE,
|
|
357
406
|
model: str | None = None,
|
|
358
407
|
allowed_tools: list[str] | None = None,
|
|
359
408
|
max_concurrent: int = 30,
|
|
@@ -386,6 +435,56 @@ async def run_full_dataset(
|
|
|
386
435
|
hud_console.error(f"No tasks found in: {source}")
|
|
387
436
|
raise typer.Exit(1)
|
|
388
437
|
|
|
438
|
+
# Warn/confirm once if any task uses local MCP config
|
|
439
|
+
try:
|
|
440
|
+
if _tasks_use_local_mcp(tasks):
|
|
441
|
+
hud_console.warning(
|
|
442
|
+
"Detected local MCP configurations (use 'command' instead of a 'url')."
|
|
443
|
+
)
|
|
444
|
+
hud_console.info(
|
|
445
|
+
"When running many tasks concurrently, exposed host ports from Docker may conflict."
|
|
446
|
+
)
|
|
447
|
+
proceed = hud_console.confirm(
|
|
448
|
+
"Proceed with running local MCP servers for this evaluation?",
|
|
449
|
+
default=True,
|
|
450
|
+
)
|
|
451
|
+
if not proceed:
|
|
452
|
+
# Helpful hint when source is a file path
|
|
453
|
+
try:
|
|
454
|
+
path = Path(source)
|
|
455
|
+
if path.exists():
|
|
456
|
+
hud_console.hint(
|
|
457
|
+
f"You can convert tasks to remote with: hud convert {path.name}"
|
|
458
|
+
)
|
|
459
|
+
else:
|
|
460
|
+
hud_console.hint(
|
|
461
|
+
"You can convert tasks to remote with: hud convert <tasks_file>"
|
|
462
|
+
)
|
|
463
|
+
except Exception:
|
|
464
|
+
hud_console.hint(
|
|
465
|
+
"You can convert tasks to remote with: hud convert <tasks_file>"
|
|
466
|
+
)
|
|
467
|
+
raise typer.Exit(1)
|
|
468
|
+
# Always show the convert hint for awareness
|
|
469
|
+
try:
|
|
470
|
+
path = Path(source)
|
|
471
|
+
if path.exists():
|
|
472
|
+
hud_console.hint(
|
|
473
|
+
f"Convert to remote to avoid port conflicts: hud convert {path.name}"
|
|
474
|
+
)
|
|
475
|
+
else:
|
|
476
|
+
hud_console.hint(
|
|
477
|
+
"Convert to remote to avoid port conflicts: hud convert <tasks_file>"
|
|
478
|
+
)
|
|
479
|
+
except Exception:
|
|
480
|
+
hud_console.hint(
|
|
481
|
+
"Convert to remote to avoid port conflicts: hud convert <tasks_file>"
|
|
482
|
+
)
|
|
483
|
+
except typer.Exit:
|
|
484
|
+
raise
|
|
485
|
+
except Exception as e:
|
|
486
|
+
hud_console.debug(f"Local MCP confirmation skipped due to error: {e}")
|
|
487
|
+
|
|
389
488
|
# Convert Task objects to dicts for dataset runners
|
|
390
489
|
dataset_or_tasks = [task.model_dump() for task in tasks]
|
|
391
490
|
|
|
@@ -395,12 +494,12 @@ async def run_full_dataset(
|
|
|
395
494
|
|
|
396
495
|
# Build agent class + config for run_dataset
|
|
397
496
|
agent_config: dict[str, Any]
|
|
398
|
-
if agent_type ==
|
|
497
|
+
if agent_type == AgentType.INTEGRATION_TEST: # --integration-test mode
|
|
399
498
|
from hud.agents.misc.integration_test_agent import IntegrationTestRunner
|
|
400
499
|
|
|
401
500
|
agent_class = IntegrationTestRunner
|
|
402
501
|
agent_config = {"verbose": verbose}
|
|
403
|
-
elif agent_type ==
|
|
502
|
+
elif agent_type == AgentType.VLLM:
|
|
404
503
|
try:
|
|
405
504
|
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
|
|
406
505
|
|
|
@@ -419,7 +518,7 @@ async def run_full_dataset(
|
|
|
419
518
|
allowed_tools=allowed_tools,
|
|
420
519
|
verbose=verbose,
|
|
421
520
|
)
|
|
422
|
-
elif agent_type ==
|
|
521
|
+
elif agent_type == AgentType.OPENAI:
|
|
423
522
|
try:
|
|
424
523
|
from hud.agents import OperatorAgent
|
|
425
524
|
|
|
@@ -435,7 +534,7 @@ async def run_full_dataset(
|
|
|
435
534
|
if allowed_tools:
|
|
436
535
|
agent_config["allowed_tools"] = allowed_tools
|
|
437
536
|
|
|
438
|
-
elif agent_type ==
|
|
537
|
+
elif agent_type == AgentType.LITELLM:
|
|
439
538
|
try:
|
|
440
539
|
from hud.agents.lite_llm import LiteAgent
|
|
441
540
|
|
|
@@ -539,8 +638,8 @@ def eval_command(
|
|
|
539
638
|
"--full",
|
|
540
639
|
help="Run the entire dataset (omit for single-task debug mode)",
|
|
541
640
|
),
|
|
542
|
-
agent:
|
|
543
|
-
|
|
641
|
+
agent: AgentType = typer.Option( # noqa: B008
|
|
642
|
+
AgentType.CLAUDE,
|
|
544
643
|
"--agent",
|
|
545
644
|
help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
|
|
546
645
|
),
|
|
@@ -648,21 +747,21 @@ def eval_command(
|
|
|
648
747
|
|
|
649
748
|
# We pass integration_test as the agent_type
|
|
650
749
|
if integration_test:
|
|
651
|
-
agent =
|
|
750
|
+
agent = AgentType.INTEGRATION_TEST
|
|
652
751
|
|
|
653
752
|
# Check for required API keys
|
|
654
|
-
if agent ==
|
|
753
|
+
if agent == AgentType.CLAUDE:
|
|
655
754
|
if not settings.anthropic_api_key:
|
|
656
755
|
hud_console.error("ANTHROPIC_API_KEY is required for Claude agent")
|
|
657
756
|
hud_console.info(
|
|
658
757
|
"Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here"
|
|
659
758
|
)
|
|
660
759
|
raise typer.Exit(1)
|
|
661
|
-
elif agent ==
|
|
760
|
+
elif agent == AgentType.OPENAI and not settings.openai_api_key:
|
|
662
761
|
hud_console.error("OPENAI_API_KEY is required for OpenAI agent")
|
|
663
762
|
hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here")
|
|
664
763
|
raise typer.Exit(1)
|
|
665
|
-
elif agent ==
|
|
764
|
+
elif agent == AgentType.VLLM:
|
|
666
765
|
if model:
|
|
667
766
|
hud_console.info(f"Using vLLM with model: {model}")
|
|
668
767
|
else:
|