hud-python 0.4.51__py3-none-any.whl → 0.4.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +13 -1
- hud/agents/base.py +14 -3
- hud/agents/lite_llm.py +1 -1
- hud/agents/openai_chat_generic.py +15 -3
- hud/agents/tests/test_base.py +9 -2
- hud/agents/tests/test_base_runtime.py +164 -0
- hud/cli/__init__.py +18 -25
- hud/cli/build.py +35 -27
- hud/cli/dev.py +11 -29
- hud/cli/eval.py +114 -145
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +26 -3
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +134 -0
- hud/cli/tests/test_eval.py +4 -0
- hud/cli/tests/test_mcp_server.py +8 -7
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/utils/docker.py +120 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +257 -0
- hud/clients/base.py +1 -1
- hud/clients/mcp_use.py +3 -1
- hud/datasets/parallel.py +2 -2
- hud/datasets/runner.py +85 -24
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_runner.py +106 -0
- hud/datasets/tests/test_utils.py +228 -0
- hud/otel/config.py +8 -6
- hud/otel/context.py +4 -4
- hud/otel/exporters.py +231 -57
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_instrumentation.py +207 -0
- hud/rl/learner.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/shared/exceptions.py +35 -9
- hud/shared/hints.py +25 -0
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +39 -30
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +30 -6
- hud/telemetry/async_context.py +331 -0
- hud/telemetry/job.py +51 -12
- hud/telemetry/tests/test_async_context.py +242 -0
- hud/telemetry/tests/test_instrument.py +414 -0
- hud/telemetry/tests/test_job.py +609 -0
- hud/telemetry/tests/test_trace.py +184 -6
- hud/telemetry/trace.py +16 -17
- hud/tools/computer/qwen.py +4 -1
- hud/tools/computer/settings.py +2 -2
- hud/tools/executors/base.py +4 -2
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/types.py +7 -1
- hud/utils/agent_factories.py +1 -3
- hud/utils/mcp.py +1 -1
- hud/utils/task_tracking.py +223 -0
- hud/utils/tests/test_agent_factories.py +60 -0
- hud/utils/tests/test_mcp.py +4 -6
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tasks.py +187 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.51.dist-info → hud_python-0.4.53.dist-info}/METADATA +48 -48
- {hud_python-0.4.51.dist-info → hud_python-0.4.53.dist-info}/RECORD +88 -47
- {hud_python-0.4.51.dist-info → hud_python-0.4.53.dist-info}/WHEEL +0 -0
- {hud_python-0.4.51.dist-info → hud_python-0.4.53.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.51.dist-info → hud_python-0.4.53.dist-info}/licenses/LICENSE +0 -0
hud/cli/eval.py
CHANGED
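The bulk of this change consolidates the duplicated vLLM agent setup into a single _build_vllm_config helper, adds "validate_api_key": False to the Claude/OpenAI configs, and drops the process-based --parallel path in favor of plain asyncio concurrency. As a rough sketch (not part of the diff itself), the helper returns a plain dict that is splatted into GenericOpenAIChatAgent; the values below mirror the defaults visible in the diff, assuming no --vllm-base-url is passed:

    # Illustrative only: config produced by _build_vllm_config(None, None, None, False)
    config = {
        "api_key": "token-abc123",               # placeholder key for a local vLLM server
        "base_url": "http://localhost:8000/v1",  # default when no --vllm-base-url is given
        "model_name": "served-model",            # fallback model name
        "verbose": False,
        "completion_kwargs": {"temperature": 0.7, "max_tokens": 2048, "tool_choice": "auto"},
    }
    agent = GenericOpenAIChatAgent(**config)     # how build_agent consumes the dict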
@@ -68,6 +68,50 @@ def get_available_models() -> list[dict[str, str | None]]:
     return []
 
 
+def _build_vllm_config(
+    vllm_base_url: str | None,
+    model: str | None,
+    allowed_tools: list[str] | None,
+    verbose: bool,
+) -> dict[str, Any]:
+    """Build configuration for vLLM agent.
+
+    Args:
+        vllm_base_url: Optional base URL for vLLM server
+        model: Model name to use
+        allowed_tools: Optional list of allowed tools
+        verbose: Enable verbose output
+
+    Returns:
+        Dictionary with agent configuration
+    """
+    # Determine base URL and API key
+    if vllm_base_url is not None:
+        base_url = vllm_base_url
+        api_key = settings.api_key if base_url.startswith(settings.hud_rl_url) else "token-abc123"
+        hud_console.info(f"Using vLLM server at {base_url}")
+    else:
+        base_url = "http://localhost:8000/v1"
+        api_key = "token-abc123"
+
+    config: dict[str, Any] = {
+        "api_key": api_key,
+        "base_url": base_url,
+        "model_name": model or "served-model",
+        "verbose": verbose,
+        "completion_kwargs": {
+            "temperature": 0.7,
+            "max_tokens": 2048,
+            "tool_choice": "auto",
+        },
+    }
+
+    if allowed_tools:
+        config["allowed_tools"] = allowed_tools
+
+    return config
+
+
 def build_agent(
     agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"],
     *,
@@ -86,8 +130,6 @@ def build_agent(
     elif agent_type == "vllm":
         # Create a generic OpenAI agent for vLLM server
         try:
-            from openai import AsyncOpenAI
-
             from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
         except ImportError as e:
             hud_console.error(
@@ -96,36 +138,14 @@
             )
             raise typer.Exit(1) from e
 
-        #
-
-
-
-
-            api_key = (
-                settings.api_key if base_url.startswith(settings.hud_rl_url) else "token-abc123"
-            )
-        else:
-            # Default to localhost
-            base_url = "http://localhost:8000/v1"
-            api_key = "token-abc123"
-
-        # Create OpenAI client for vLLM
-        openai_client = AsyncOpenAI(
-            base_url=base_url,
-            api_key=api_key,
-            timeout=30.0,
-        )
-
-        return GenericOpenAIChatAgent(
-            openai_client=openai_client,
-            model_name=model or "served-model",  # Default model name
+        # Use the shared config builder
+        config = _build_vllm_config(
+            vllm_base_url=vllm_base_url,
+            model=model,
+            allowed_tools=allowed_tools,
             verbose=verbose,
-            completion_kwargs={
-                "temperature": 0.7,
-                "max_tokens": 2048,
-                "tool_choice": "required",  # if self.actor_config.force_tool_choice else "auto",
-            },
         )
+        return GenericOpenAIChatAgent(**config)
 
     elif agent_type == "openai":
         try:
@@ -257,25 +277,17 @@ async def run_single_task(
         agent_config["allowed_tools"] = allowed_tools
     elif agent_type == "vllm":
         # Special handling for vLLM
-
-
+        from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
+
+        agent_class = GenericOpenAIChatAgent
+
+        # Use the shared config builder
+        agent_config = _build_vllm_config(
+            vllm_base_url=vllm_base_url,
            model=model,
            allowed_tools=allowed_tools,
            verbose=verbose,
-            vllm_base_url=vllm_base_url,
         )
-        agent_config = {
-            "openai_client": sample_agent.oai,
-            "model_name": sample_agent.model_name,
-            "verbose": verbose,
-            "completion_kwargs": sample_agent.completion_kwargs,
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-
-        from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
-
-        agent_class = GenericOpenAIChatAgent
     elif agent_type == "openai":
         from hud.agents import OperatorAgent
 
@@ -300,6 +312,7 @@ async def run_single_task(
         agent_config = {
             "model": model or "claude-sonnet-4-20250514",
             "verbose": verbose,
+            "validate_api_key": False,
         }
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
@@ -345,24 +358,18 @@ async def run_full_dataset(
     allowed_tools: list[str] | None = None,
     max_concurrent: int = 30,
     max_steps: int = 10,
-    parallel: bool = False,
-    max_workers: int | None = None,
-    max_concurrent_per_worker: int = 25,
     verbose: bool = False,
     vllm_base_url: str | None = None,
     group_size: int = 1,
 ) -> list[Any]:
-    """Run evaluation across the entire dataset.
-
-    Uses either asyncio-based run_dataset or process-based parallel execution
-    depending on the parallel flag."""
+    """Run evaluation across the entire dataset using asyncio-based concurrency."""
 
     # Provide early feedback to user
     hud_console.info("🔧 Initializing evaluation...")
 
     # Import run_dataset lazily
     try:
-        from hud.datasets import run_dataset
+        from hud.datasets import run_dataset
         from hud.utils.tasks import load_tasks
     except ImportError as e:
         hud_console.error(
@@ -387,6 +394,7 @@ async def run_full_dataset(
     dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1]
 
     # Build agent class + config for run_dataset
+    agent_config: dict[str, Any]
     if agent_type == "integration_test":  # --integration-test mode
         from hud.agents.misc.integration_test_agent import IntegrationTestRunner
 
@@ -404,24 +412,13 @@
         )
         raise typer.Exit(1) from e
 
-        # Use
-
-
+        # Use the shared config builder
+        agent_config = _build_vllm_config(
+            vllm_base_url=vllm_base_url,
            model=model,
            allowed_tools=allowed_tools,
            verbose=verbose,
-            vllm_base_url=vllm_base_url,
         )
-
-        # Extract the config from the sample agent
-        agent_config: dict[str, Any] = {
-            "openai_client": sample_agent.oai,
-            "model_name": sample_agent.model_name,
-            "verbose": verbose,
-            "completion_kwargs": sample_agent.completion_kwargs,
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
     elif agent_type == "openai":
         try:
             from hud.agents import OperatorAgent
@@ -434,7 +431,7 @@
             )
             raise typer.Exit(1) from e
 
-        agent_config = {"verbose": verbose}
+        agent_config = {"verbose": verbose, "validate_api_key": False}
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
 
@@ -472,6 +469,7 @@
         agent_config = {
             "model": model or "claude-sonnet-4-20250514",
             "verbose": verbose,
+            "validate_api_key": False,
         }
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
@@ -505,9 +503,7 @@
         agent_class=agent_class,
         agent_config=agent_config,
         group_size=group_size,
-        max_parallel_episodes=max_concurrent
-        if not parallel
-        else max_concurrent_per_worker * (max_workers or 4),
+        max_parallel_episodes=max_concurrent,
         max_steps=max_steps,
         verbose=verbose,
         job_id=job.id,
@@ -519,48 +515,18 @@
         # Return stats for consistency with other modes
         return stats
 
-    #
-
-
-
-
-
-
-
-
-
-
-            max_concurrent=max_concurrent,
-            metadata={"dataset": source, "parallel": True},
-            max_steps=max_steps,
-            auto_respond=True,
-        )
-    else:
-        # Use manual configuration
-        return await run_dataset_parallel_manual(
-            name=f"Evaluation {dataset_name}",
-            dataset=dataset_or_tasks,
-            agent_class=agent_class,
-            agent_config=agent_config,
-            max_workers=max_workers,
-            max_concurrent_per_worker=max_concurrent_per_worker,
-            max_concurrent=max_concurrent,
-            metadata={"dataset": source, "parallel": True},
-            max_steps=max_steps,
-            auto_respond=True,
-        )
-    else:
-        hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
-        return await run_dataset(
-            name=f"Evaluation {dataset_name}",
-            dataset=dataset_or_tasks,
-            agent_class=agent_class,
-            agent_config=agent_config,
-            max_concurrent=max_concurrent,
-            metadata={"dataset": source},
-            max_steps=max_steps,
-        )
+    # Run evaluation with asyncio-based concurrency
+    hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
+    return await run_dataset(
+        name=f"Evaluation {dataset_name}",
+        dataset=dataset_or_tasks,
+        agent_class=agent_class,
+        agent_config=agent_config,
+        max_concurrent=max_concurrent,
+        metadata={"dataset": source},
+        max_steps=max_steps,
+        auto_respond=True,
+    )
 
 
 def eval_command(
@@ -591,31 +557,20 @@ def eval_command(
     max_concurrent: int = typer.Option(
         30,
         "--max-concurrent",
-        help=
+        help=(
+            "Maximum concurrent tasks (1-200 recommended, prevents rate limits "
+            "and resource exhaustion)"
+        ),
     ),
     max_steps: int | None = typer.Option(
         None,
         "--max-steps",
         help="Maximum steps per task (default: 10 for single, 50 for full)",
     ),
-    parallel: bool = typer.Option(
-        False,
-        "--parallel",
-        help="Use process-based parallel execution for large datasets (100+ tasks)",
-    ),
-    max_workers: int | None = typer.Option(
-        None,
-        "--max-workers",
-        help="Number of worker processes for parallel mode (auto-optimized if not set)",
-    ),
-    max_concurrent_per_worker: int = typer.Option(
-        20,
-        "--max-concurrent-per-worker",
-        help="Maximum concurrent tasks per worker in parallel mode",
-    ),
     verbose: bool = typer.Option(
         False,
         "--verbose",
+        "-v",
         help="Enable verbose output from the agent",
     ),
     very_verbose: bool = typer.Option(
@@ -650,23 +605,20 @@ def eval_command(
         # Evaluate a single task from SheetBench
         hud eval hud-evals/SheetBench-50
 
-        # Evaluate the FULL SheetBench dataset with Claude
+        # Evaluate the FULL SheetBench dataset with Claude
         hud eval hud-evals/SheetBench-50 --full --agent claude
 
-        # Run
-        hud eval hud-evals/OSWorld-Verified-Gold --full --
+        # Run with higher concurrency for faster evaluation
+        hud eval hud-evals/OSWorld-Verified-Gold --full --max-concurrent 100
 
-        #
-        hud eval hud-evals/
-
-        # Limit total concurrent tasks to prevent rate limits
-        hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20
+        # Limit concurrent tasks to prevent rate limits
+        hud eval hud-evals/SheetBench-50 --full --max-concurrent 20
 
         # Run a single task from a JSON file
         hud eval task.json
 
-        # Run multiple tasks from a JSON file
-        hud eval tasks.json --full
+        # Run multiple tasks from a JSON file
+        hud eval tasks.json --full
 
         # Run with OpenAI Operator agent
         hud eval hud-evals/OSWorld-Gold-Beta --agent openai
@@ -680,8 +632,6 @@ def eval_command(
         # Run with verbose output for debugging
         hud eval task.json --verbose
     """
-    from hud.settings import settings
-
     # Always configure basic logging so agent steps can be logged
     # Set to INFO by default for consistency with run_evaluation.py
     if very_verbose:
@@ -736,7 +686,11 @@ def eval_command(
 
     # Run evaluation
     if full:
-
+        import time
+
+        start_time = time.time()
+
+        results = asyncio.run(
             run_full_dataset(
                 source,
                 agent_type=agent,
@@ -744,14 +698,29 @@
                 allowed_tools=allowed_tools_list,
                 max_concurrent=max_concurrent,
                 max_steps=max_steps,
-                parallel=parallel,
-                max_workers=max_workers,
-                max_concurrent_per_worker=max_concurrent_per_worker,
                 verbose=very_verbose or verbose,
                 vllm_base_url=vllm_base_url,
                 group_size=group_size,
             )
         )
+
+        elapsed = time.time() - start_time
+
+        # Print statistics (only for non-grouped mode)
+        if group_size == 1 and results:
+            hud_console.info("\n" + "=" * 50)
+            hud_console.success("📊 Evaluation Complete!")
+            hud_console.info("=" * 50)
+            hud_console.info(f"Total tasks: {len(results)}")
+            hud_console.info(f"Time elapsed: {elapsed:.2f} seconds")
+            hud_console.info(f"Throughput: {len(results) / elapsed:.2f} tasks/second")
+            hud_console.info(f"Execution mode: ASYNCIO (max_concurrent: {max_concurrent})")
+
+            # Count successes
+            successful = sum(1 for r in results if getattr(r, "reward", 0) > 0.7)
+            success_rate = 100 * successful / len(results)
+            hud_console.info(f"Successful tasks: {successful}/{len(results)} ({success_rate:.1f}%)")
+            hud_console.info("=" * 50)
     else:
         asyncio.run(
             run_single_task(
hud/cli/tests/test_analyze_module.py
ADDED
@@ -0,0 +1,120 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from hud.cli.analyze import (
+    analyze_environment,
+    analyze_environment_from_config,
+    analyze_environment_from_mcp_config,
+    display_interactive,
+    display_markdown,
+    parse_docker_command,
+)
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+# Mark entire module as asyncio to ensure async tests run with pytest-asyncio
+pytestmark = pytest.mark.asyncio
+
+
+def test_parse_docker_command():
+    cmd = ["docker", "run", "--rm", "-i", "img"]
+    cfg = parse_docker_command(cmd)
+    assert cfg == {"local": {"command": "docker", "args": ["run", "--rm", "-i", "img"]}}
+
+
+@pytest.mark.asyncio
+@patch("hud.cli.analyze.MCPClient")
+@patch("hud.cli.analyze.console")
+async def test_analyze_environment_success_json(mock_console, MockClient):
+    client = AsyncMock()
+    client.initialize.return_value = None
+    client.analyze_environment.return_value = {"tools": [], "resources": []}
+    client.shutdown.return_value = None
+    MockClient.return_value = client
+
+    await analyze_environment(["docker", "run", "img"], output_format="json", verbose=False)
+    assert client.initialize.awaited
+    assert client.analyze_environment.awaited
+    assert client.shutdown.awaited
+    assert mock_console.print_json.called
+
+
+@pytest.mark.asyncio
+@patch("hud.cli.analyze.MCPClient")
+@patch("hud.cli.analyze.console")
+async def test_analyze_environment_failure(mock_console, MockClient):
+    client = AsyncMock()
+    client.initialize.side_effect = RuntimeError("boom")
+    client.shutdown.return_value = None
+    MockClient.return_value = client
+
+    # Should swallow exception and return without raising
+    await analyze_environment(["docker", "run", "img"], output_format="json", verbose=True)
+    assert client.shutdown.awaited
+    assert mock_console.print_json.called is False
+
+
+def test_display_interactive_metadata_only(monkeypatch):
+    import hud.cli.analyze as mod
+
+    monkeypatch.setattr(mod, "console", MagicMock(), raising=False)
+    monkeypatch.setattr(mod, "hud_console", MagicMock(), raising=False)
+
+    analysis = {
+        "image": "img:latest",
+        "status": "cached",
+        "tool_count": 2,
+        "tools": [
+            {"name": "t1", "description": "d1", "inputSchema": {"type": "object"}},
+            {"name": "t2", "description": "d2"},
+        ],
+        "resources": [],
+    }
+    display_interactive(analysis)
+
+
+def test_display_markdown_both_paths(capsys):
+    # metadata-only
+    md_only = {"image": "img:latest", "tool_count": 0, "tools": [], "resources": []}
+    display_markdown(md_only)
+
+    # live metadata
+    live = {"metadata": {"servers": ["s1"], "initialized": True}, "tools": [], "resources": []}
+    display_markdown(live)
+
+    # Check that output was generated
+    captured = capsys.readouterr()
+    assert "MCP Environment Analysis" in captured.out
+
+
+@patch("hud.cli.analyze.MCPClient")
+async def test_analyze_environment_from_config(MockClient, tmp_path: Path):
+    client = AsyncMock()
+    client.initialize.return_value = None
+    client.analyze_environment.return_value = {"tools": [], "resources": []}
+    client.shutdown.return_value = None
+    MockClient.return_value = client
+
+    cfg = tmp_path / "mcp.json"
+    cfg.write_text('{"local": {"command": "docker", "args": ["run", "img"]}}')
+    await analyze_environment_from_config(cfg, output_format="json", verbose=False)
+    assert client.initialize.awaited and client.shutdown.awaited
+
+
+@patch("hud.cli.analyze.MCPClient")
+async def test_analyze_environment_from_mcp_config(MockClient):
+    client = AsyncMock()
+    client.initialize.return_value = None
+    client.analyze_environment.return_value = {"tools": [], "resources": []}
+    client.shutdown.return_value = None
+    MockClient.return_value = client
+
+    mcp_config = {"local": {"command": "docker", "args": ["run", "img"]}}
+    await analyze_environment_from_mcp_config(mcp_config, output_format="json", verbose=False)
+    assert client.initialize.awaited and client.shutdown.awaited
hud/cli/tests/test_build.py
CHANGED
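The updated build tests mock the client's analyze_environment call and expect the lock file to group image references under an images key. A hedged sketch of the shape the assertions below check for (only the asserted keys; the real hud.lock.yaml contains more fields, and the tool entries here are hypothetical):

    # Shape implied by the lock-file assertions in test_build.py
    lock_data = {
        "images": {
            "full": "test-env:0.1.0@sha256:abc123",  # tag pinned to the image digest
            "local": "test-env:0.1.0",               # plain local tag
        },
        "build": {"version": "0.1.0"},
        "environment": {"toolCount": 2},
        "tools": [{"name": "tool_a"}, {"name": "tool_b"}],  # hypothetical entries
    }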
@@ -219,6 +219,17 @@ class TestAnalyzeMcpEnvironment:
         mock_tool.description = "Test tool"
         mock_tool.inputSchema = {"type": "object"}
 
+        # Prefer analyze_environment path (aligns with analyze CLI tests)
+        mock_client.analyze_environment = mock.AsyncMock(
+            return_value={
+                "metadata": {"servers": ["local"], "initialized": True},
+                "tools": [{"name": "test_tool", "description": "Test tool"}],
+                "hub_tools": {},
+                "resources": [],
+                "telemetry": {},
+            }
+        )
+        # Fallback still defined for completeness
         mock_client.list_tools.return_value = [mock_tool]
 
         result = await analyze_mcp_environment("test:latest")
@@ -237,7 +248,9 @@ class TestAnalyzeMcpEnvironment:
         mock_client_class.return_value = mock_client
         mock_client.initialize.side_effect = ConnectionError("Connection failed")
 
-
+        from hud.shared.exceptions import HudException
+
+        with pytest.raises(HudException, match="Connection failed"):
             await analyze_mcp_environment("test:latest")
 
     @mock.patch("hud.cli.build.MCPClient")
@@ -245,6 +258,15 @@ class TestAnalyzeMcpEnvironment:
         """Test analysis in verbose mode."""
         mock_client = mock.AsyncMock()
         mock_client_class.return_value = mock_client
+        mock_client.analyze_environment = mock.AsyncMock(
+            return_value={
+                "metadata": {"servers": ["local"], "initialized": True},
+                "tools": [],
+                "hub_tools": {},
+                "resources": [],
+                "telemetry": {},
+            }
+        )
         mock_client.list_tools.return_value = []
 
         # Just test that it runs without error in verbose mode
@@ -363,7 +385,7 @@ ENV API_KEY
         mock_run.return_value = mock_result
 
         # Run build
-        build_environment(str(env_dir), "test
+        build_environment(str(env_dir), "test-env:latest")
 
         # Check lock file was created
         lock_file = env_dir / "hud.lock.yaml"
@@ -373,7 +395,8 @@ ENV API_KEY
         with open(lock_file) as f:
             lock_data = yaml.safe_load(f)
 
-        assert lock_data["
+        assert lock_data["images"]["full"] == "test-env:0.1.0@sha256:abc123"
+        assert lock_data["images"]["local"] == "test-env:0.1.0"
         assert lock_data["build"]["version"] == "0.1.0"
         assert lock_data["environment"]["toolCount"] == 2
         assert len(lock_data["tools"]) == 2
hud/cli/tests/test_build_failure.py
ADDED
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from unittest.mock import patch
+
+import pytest
+import typer
+
+from hud.cli.build import build_environment
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+@patch("hud.cli.build.compute_source_hash", return_value="deadbeef")
+@patch(
+    "hud.cli.build.analyze_mcp_environment",
+    return_value={"initializeMs": 10, "toolCount": 0, "tools": []},
+)
+@patch("hud.cli.build.build_docker_image", return_value=True)
+def test_build_label_rebuild_failure(_bd, _an, _hash, tmp_path: Path, monkeypatch):
+    # Minimal environment dir
+    env = tmp_path / "env"
+    env.mkdir()
+    (env / "Dockerfile").write_text("FROM python:3.11")
+
+    # Ensure subprocess.run returns non-zero for the second build (label build)
+    import types
+
+    def run_side_effect(cmd, *a, **k):
+        # Return 0 for first docker build, 1 for label build
+        if isinstance(cmd, list) and cmd[:2] == ["docker", "build"] and "--label" in cmd:
+            return types.SimpleNamespace(returncode=1, stderr="boom")
+        return types.SimpleNamespace(returncode=0, stdout="")
+
+    monkeypatch.setenv("FASTMCP_DISABLE_BANNER", "1")
+    with (
+        patch("hud.cli.build.subprocess.run", side_effect=run_side_effect),
+        pytest.raises(typer.Exit),
+    ):
+        build_environment(str(env), verbose=False)
hud/cli/tests/test_build_module.py
ADDED
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from unittest import mock
+
+from hud.cli.build import (
+    extract_env_vars_from_dockerfile,
+    get_docker_image_digest,
+    get_docker_image_id,
+)
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+def test_extract_env_vars_from_dockerfile_complex(tmp_path: Path):
+    dockerfile = tmp_path / "Dockerfile"
+    dockerfile.write_text(
+        """
+FROM python:3.11
+ARG BUILD_TOKEN
+ARG DEFAULTED=1
+ENV RUNTIME_KEY
+ENV FROM_ARG=$BUILD_TOKEN
+ENV WITH_DEFAULT=val
+"""
+    )
+    required, optional = extract_env_vars_from_dockerfile(dockerfile)
+    # BUILD_TOKEN required (ARG without default)
+    assert "BUILD_TOKEN" in required
+    # RUNTIME_KEY required (ENV without value)
+    assert "RUNTIME_KEY" in required
+    # FROM_ARG references BUILD_TOKEN -> required
+    assert "FROM_ARG" in required
+    # DEFAULTED and WITH_DEFAULT should not be marked required by default
+    assert "DEFAULTED" not in required
+    assert "WITH_DEFAULT" not in required
+    assert optional == []
+
+
+@mock.patch("subprocess.run")
+def test_get_docker_image_digest_none(mock_run):
+    mock_run.return_value = mock.Mock(stdout="[]", returncode=0)
+    assert get_docker_image_digest("img") is None
+
+
+@mock.patch("subprocess.run")
+def test_get_docker_image_id_ok(mock_run):
+    mock_run.return_value = mock.Mock(stdout="sha256:abc", returncode=0)
+    assert get_docker_image_id("img") == "sha256:abc"