hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/datasets/utils.py
CHANGED
|
@@ -1,118 +1,298 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Utility functions and schemas for the datasets module."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
import json
|
|
6
5
|
import logging
|
|
7
|
-
from typing import Any
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
8
7
|
|
|
9
|
-
|
|
8
|
+
import httpx
|
|
9
|
+
from pydantic import BaseModel, Field, field_validator, model_validator
|
|
10
10
|
|
|
11
|
-
from hud.
|
|
11
|
+
from hud.settings import settings
|
|
12
|
+
from hud.types import AgentType, TaskInput
|
|
13
|
+
from hud.utils.hud_console import HUDConsole
|
|
12
14
|
|
|
13
|
-
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from collections.abc import Sequence
|
|
14
17
|
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
hud_console = HUDConsole()
|
|
15
20
|
|
|
16
|
-
|
|
17
|
-
""
|
|
18
|
-
|
|
21
|
+
__all__ = [
|
|
22
|
+
"BatchRequest",
|
|
23
|
+
"SingleTaskRequest",
|
|
24
|
+
"cancel_all_jobs",
|
|
25
|
+
"cancel_job",
|
|
26
|
+
"cancel_task",
|
|
27
|
+
"submit_rollouts",
|
|
28
|
+
]
|
|
19
29
|
|
|
20
|
-
Args:
|
|
21
|
-
dataset_id: HuggingFace dataset identifier (e.g., "hud-evals/SheetBench-50")
|
|
22
30
|
|
|
23
|
-
|
|
24
|
-
|
|
31
|
+
class SingleTaskRequest(BaseModel):
|
|
32
|
+
"""Request to run a single task remotely - mirrors run_single_task() args."""
|
|
33
|
+
|
|
34
|
+
task: dict[str, Any] = Field(
|
|
35
|
+
description="Task definition (v4 LegacyTask or v5 Task format).",
|
|
36
|
+
)
|
|
37
|
+
agent_type: AgentType = Field(description="Agent type to execute the task.")
|
|
38
|
+
agent_params: dict[str, Any] = Field(
|
|
39
|
+
default_factory=dict,
|
|
40
|
+
description="Agent constructor parameters passed to agent.create(). "
|
|
41
|
+
"Should include fields from BaseCreateParams (auto_trace, auto_respond, verbose) "
|
|
42
|
+
"plus agent-specific config fields (e.g., checkpoint_name for ClaudeConfig).",
|
|
43
|
+
)
|
|
44
|
+
max_steps: int = Field(default=10, description="Maximum steps allowed for the agent.")
|
|
45
|
+
job_id: str = Field(description="HUD job identifier for telemetry association.")
|
|
46
|
+
task_id: str | None = Field(default=None, description="Task identifier.")
|
|
47
|
+
trace_name: str | None = Field(default=None, description="Trace name.")
|
|
48
|
+
group_id: str | None = Field(default=None, description="Optional HUD group identifier.")
|
|
49
|
+
metadata: dict[str, Any] = Field(
|
|
50
|
+
default_factory=dict,
|
|
51
|
+
description="Additional metadata to inject into the trace context.",
|
|
52
|
+
)
|
|
53
|
+
trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.")
|
|
54
|
+
use_byok: bool = Field(
|
|
55
|
+
default=False,
|
|
56
|
+
description="If True, use BYOK headers from encrypted env vars for inference.",
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
@model_validator(mode="after")
|
|
60
|
+
def _validate_task(self) -> SingleTaskRequest:
|
|
61
|
+
"""Validate task is either v4 LegacyTask or v5 Task format."""
|
|
62
|
+
from hud.eval.utils import is_v4_format, validate_v4_task
|
|
63
|
+
|
|
64
|
+
# v4 format: looks like v4 (prompt + mcp_config)?
|
|
65
|
+
if is_v4_format(self.task):
|
|
66
|
+
# Validate completeness (requires evaluate_tool too)
|
|
67
|
+
validate_v4_task(self.task)
|
|
68
|
+
return self
|
|
69
|
+
|
|
70
|
+
# v5 format: env required
|
|
71
|
+
if "env" in self.task:
|
|
72
|
+
return self
|
|
73
|
+
|
|
74
|
+
# Neither v4 nor v5
|
|
75
|
+
raise ValueError("Task must have 'env' (v5) or 'prompt'+'mcp_config'+'evaluate_tool' (v4)")
|
|
76
|
+
|
|
77
|
+
@field_validator("job_id")
|
|
78
|
+
@classmethod
|
|
79
|
+
def _validate_job_id(cls, value: str) -> str:
|
|
80
|
+
if not value or not value.strip():
|
|
81
|
+
raise ValueError("job_id must be a non-empty string.")
|
|
82
|
+
return value
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class BatchRequest(BaseModel):
|
|
86
|
+
"""Request to run multiple tasks remotely."""
|
|
87
|
+
|
|
88
|
+
requests: list[SingleTaskRequest] = Field(
|
|
89
|
+
description="List of single task requests to submit.",
|
|
90
|
+
min_length=1,
|
|
91
|
+
max_length=1000,
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _normalize_tasks(tasks: Sequence[TaskInput]) -> list[dict[str, Any]]:
|
|
96
|
+
"""Convert tasks to list of dicts for remote API submission."""
|
|
97
|
+
result = []
|
|
98
|
+
for t in tasks:
|
|
99
|
+
if isinstance(t, dict):
|
|
100
|
+
result.append(t)
|
|
101
|
+
elif hasattr(t, "model_dump"):
|
|
102
|
+
result.append(t.model_dump(mode="json"))
|
|
103
|
+
else:
|
|
104
|
+
raise TypeError(f"Cannot convert {type(t).__name__} to dict")
|
|
105
|
+
return result
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
async def submit_rollouts(
|
|
109
|
+
tasks: Sequence[TaskInput],
|
|
110
|
+
job_id: str,
|
|
111
|
+
agent_type: AgentType,
|
|
112
|
+
agent_params: dict[str, Any] | None = None,
|
|
113
|
+
max_steps: int = 10,
|
|
114
|
+
group_size: int = 1,
|
|
115
|
+
batch_size: int = 50,
|
|
116
|
+
metadata: dict[str, Any] | None = None,
|
|
117
|
+
use_byok: bool = False,
|
|
118
|
+
) -> None:
|
|
119
|
+
"""Submit rollouts to the HUD platform API for remote execution (fire-and-forget).
|
|
120
|
+
|
|
121
|
+
Args:
|
|
122
|
+
tasks: List of tasks (v5 Task, v4 LegacyTask, or dicts)
|
|
123
|
+
job_id: HUD job ID for telemetry grouping
|
|
124
|
+
agent_type: Agent type to use for execution
|
|
125
|
+
agent_params: Parameters passed to agent.create()
|
|
126
|
+
max_steps: Maximum steps per rollout
|
|
127
|
+
group_size: Number of rollouts per task (for variance estimation)
|
|
128
|
+
batch_size: Number of rollouts per API batch request
|
|
129
|
+
metadata: Additional metadata for each rollout
|
|
130
|
+
use_byok: If True, use BYOK keys from encrypted env vars (remote only)
|
|
25
131
|
"""
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
132
|
+
from hud.eval.utils import is_v4_format
|
|
133
|
+
|
|
134
|
+
if not settings.api_key:
|
|
135
|
+
raise ValueError("HUD_API_KEY is required for remote execution")
|
|
136
|
+
|
|
137
|
+
# Convert to dicts once for uniform processing
|
|
138
|
+
task_dicts = _normalize_tasks(tasks)
|
|
139
|
+
|
|
140
|
+
# Validate v4 tasks have remote-compatible mcp_config (URL-based, not command-based)
|
|
141
|
+
for i, td in enumerate(task_dicts):
|
|
142
|
+
if not is_v4_format(td):
|
|
143
|
+
continue # v5 tasks use env config, no mcp_config to check
|
|
144
|
+
mcp_config = td.get("mcp_config") or {}
|
|
145
|
+
for server_name, server_cfg in mcp_config.items():
|
|
146
|
+
is_local = (
|
|
147
|
+
isinstance(server_cfg, dict)
|
|
148
|
+
and "command" in server_cfg
|
|
149
|
+
and not server_cfg.get("url")
|
|
150
|
+
)
|
|
151
|
+
if is_local:
|
|
152
|
+
raise ValueError(
|
|
153
|
+
f"Remote execution requires URL-based mcp_config. "
|
|
154
|
+
f"Task {td.get('id') or i} uses local Docker config for '{server_name}'. "
|
|
155
|
+
"Convert to remote with: hud convert <tasks_file>"
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
# Build single task requests
|
|
159
|
+
requests: list[SingleTaskRequest] = []
|
|
160
|
+
for task_idx, td in enumerate(task_dicts):
|
|
161
|
+
base_task_id = td.get("id") or f"task_{task_idx}"
|
|
162
|
+
trace_name = td.get("prompt") or td.get("scenario") or base_task_id
|
|
163
|
+
|
|
164
|
+
for rollout_idx in range(group_size):
|
|
165
|
+
task_id = f"{base_task_id}_r{rollout_idx}" if group_size > 1 else base_task_id
|
|
166
|
+
requests.append(
|
|
167
|
+
SingleTaskRequest(
|
|
168
|
+
task=td,
|
|
169
|
+
agent_type=agent_type,
|
|
170
|
+
agent_params=agent_params or {},
|
|
171
|
+
max_steps=max_steps,
|
|
172
|
+
job_id=job_id,
|
|
173
|
+
task_id=task_id,
|
|
174
|
+
trace_name=trace_name,
|
|
175
|
+
group_id=base_task_id if group_size > 1 else None,
|
|
176
|
+
metadata=metadata or {},
|
|
177
|
+
use_byok=use_byok,
|
|
178
|
+
)
|
|
35
179
|
)
|
|
36
180
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
181
|
+
# Submit in batches
|
|
182
|
+
api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/run_list"
|
|
183
|
+
headers = {"Authorization": f"Bearer {settings.api_key}"}
|
|
184
|
+
|
|
185
|
+
total_accepted = 0
|
|
186
|
+
total_rejected = 0
|
|
187
|
+
|
|
188
|
+
async with httpx.AsyncClient(timeout=120) as client:
|
|
189
|
+
for i in range(0, len(requests), batch_size):
|
|
190
|
+
batch = requests[i : i + batch_size]
|
|
191
|
+
batch_request = BatchRequest(requests=batch)
|
|
192
|
+
|
|
193
|
+
try:
|
|
194
|
+
response = await client.post(
|
|
195
|
+
api_url,
|
|
196
|
+
json=batch_request.model_dump(mode="json"),
|
|
197
|
+
headers=headers,
|
|
198
|
+
)
|
|
199
|
+
response.raise_for_status()
|
|
200
|
+
result = response.json()
|
|
201
|
+
|
|
202
|
+
total_accepted += result.get("accepted", 0)
|
|
203
|
+
total_rejected += result.get("rejected", 0)
|
|
204
|
+
|
|
205
|
+
for item in result.get("results", []):
|
|
206
|
+
if isinstance(item, dict) and item.get("status") == "rejected":
|
|
207
|
+
hud_console.warning(f"Task rejected: {item.get('error', 'Unknown reason')}")
|
|
208
|
+
|
|
209
|
+
batch_num = (i // batch_size) + 1
|
|
210
|
+
total_batches = (len(requests) + batch_size - 1) // batch_size
|
|
211
|
+
hud_console.info(
|
|
212
|
+
f"Batch {batch_num}/{total_batches}: "
|
|
213
|
+
f"{result.get('accepted', 0)}/{len(batch)} accepted"
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
except httpx.HTTPStatusError as exc:
|
|
217
|
+
if 400 <= exc.response.status_code < 500:
|
|
218
|
+
raise ValueError(f"Submission failed: {exc.response.text}") from exc
|
|
219
|
+
hud_console.error(f"Batch submission failed: {exc.response.status_code}")
|
|
220
|
+
total_rejected += len(batch)
|
|
221
|
+
|
|
222
|
+
except Exception as exc:
|
|
223
|
+
hud_console.error(f"Batch submission failed: {exc}")
|
|
224
|
+
total_rejected += len(batch)
|
|
225
|
+
|
|
226
|
+
# Log final summary
|
|
227
|
+
if total_rejected > 0:
|
|
228
|
+
hud_console.warning(
|
|
229
|
+
f"Submitted {total_accepted}/{len(requests)} requests ({total_rejected} rejected)"
|
|
56
230
|
)
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
logger.error("Error fetching system prompt from %s: %s", dataset_id, e)
|
|
60
|
-
return None
|
|
231
|
+
else:
|
|
232
|
+
hud_console.info(f"Submitted {total_accepted}/{len(requests)} requests")
|
|
61
233
|
|
|
62
234
|
|
|
63
|
-
def
|
|
64
|
-
|
|
65
|
-
|
|
235
|
+
async def cancel_job(job_id: str) -> dict[str, Any]:
|
|
236
|
+
"""Cancel all tasks for a specific job.
|
|
237
|
+
|
|
238
|
+
Args:
|
|
239
|
+
job_id: The job ID to cancel
|
|
240
|
+
|
|
241
|
+
Returns:
|
|
242
|
+
Response with cancellation results including total_found, cancelled counts
|
|
66
243
|
"""
|
|
67
|
-
|
|
244
|
+
api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/cancel_job"
|
|
245
|
+
headers = {"Authorization": f"Bearer {settings.api_key}"}
|
|
246
|
+
|
|
247
|
+
async with httpx.AsyncClient(timeout=30) as client:
|
|
248
|
+
response = await client.post(
|
|
249
|
+
api_url,
|
|
250
|
+
json={"job_id": job_id},
|
|
251
|
+
headers=headers,
|
|
252
|
+
)
|
|
253
|
+
response.raise_for_status()
|
|
254
|
+
return response.json()
|
|
68
255
|
|
|
69
|
-
|
|
70
|
-
|
|
256
|
+
|
|
257
|
+
async def cancel_task(job_id: str, task_id: str) -> dict[str, Any]:
|
|
258
|
+
"""Cancel a specific task within a job.
|
|
71
259
|
|
|
72
260
|
Args:
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
261
|
+
job_id: The job ID
|
|
262
|
+
task_id: The specific task ID to cancel
|
|
263
|
+
|
|
264
|
+
Returns:
|
|
265
|
+
Response with cancellation result
|
|
77
266
|
"""
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
raise ValueError(
|
|
81
|
-
"save_tasks expects dictionaries, not Task objects. "
|
|
82
|
-
"Task objects have resolved environment variables which would expose secrets. "
|
|
83
|
-
"Please pass raw dictionaries with template strings like '${HUD_API_KEY}' preserved."
|
|
84
|
-
)
|
|
267
|
+
api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/cancel"
|
|
268
|
+
headers = {"Authorization": f"Bearer {settings.api_key}"}
|
|
85
269
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
"Please convert to dictionary format with template strings preserved."
|
|
95
|
-
)
|
|
270
|
+
async with httpx.AsyncClient(timeout=30) as client:
|
|
271
|
+
response = await client.post(
|
|
272
|
+
api_url,
|
|
273
|
+
json={"job_id": job_id, "task_id": task_id},
|
|
274
|
+
headers=headers,
|
|
275
|
+
)
|
|
276
|
+
response.raise_for_status()
|
|
277
|
+
return response.json()
|
|
96
278
|
|
|
97
|
-
row = {}
|
|
98
279
|
|
|
99
|
-
|
|
100
|
-
|
|
280
|
+
async def cancel_all_jobs() -> dict[str, Any]:
|
|
281
|
+
"""Cancel ALL active jobs for the authenticated user.
|
|
101
282
|
|
|
102
|
-
|
|
103
|
-
if field in tc_dict:
|
|
104
|
-
value = tc_dict[field]
|
|
105
|
-
# Serialize complex types as JSON strings
|
|
106
|
-
if isinstance(value, (dict | list)):
|
|
107
|
-
row[field] = json.dumps(value)
|
|
108
|
-
elif isinstance(value, (str | int | float | bool | type(None))):
|
|
109
|
-
row[field] = value if value is not None else ""
|
|
110
|
-
else:
|
|
111
|
-
# For other types, convert to string
|
|
112
|
-
row[field] = str(value)
|
|
283
|
+
This is a "panic button" to stop all running rollouts.
|
|
113
284
|
|
|
114
|
-
|
|
285
|
+
Returns:
|
|
286
|
+
Response with jobs_cancelled, total_tasks_cancelled, and job_details
|
|
287
|
+
"""
|
|
288
|
+
api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/cancel_user_jobs"
|
|
289
|
+
headers = {"Authorization": f"Bearer {settings.api_key}"}
|
|
115
290
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
291
|
+
async with httpx.AsyncClient(timeout=60) as client:
|
|
292
|
+
response = await client.post(
|
|
293
|
+
api_url,
|
|
294
|
+
json={},
|
|
295
|
+
headers=headers,
|
|
296
|
+
)
|
|
297
|
+
response.raise_for_status()
|
|
298
|
+
return response.json()
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HUD Environment - A unified abstraction for MCP environments.
|
|
3
|
+
|
|
4
|
+
The Environment class is a server that you can also use as a client.
|
|
5
|
+
It subclasses MCPServer to get server capabilities (@env.tool, serve())
|
|
6
|
+
and composes FastMCP Client instances for remote connections.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
from hud.environment import Environment
|
|
10
|
+
|
|
11
|
+
# Create and connect
|
|
12
|
+
env = Environment("my-env").connect_hub("browser", prefix="web")
|
|
13
|
+
|
|
14
|
+
async with env:
|
|
15
|
+
# Get tools in any format
|
|
16
|
+
openai_tools = env.as_openai_chat_tools()
|
|
17
|
+
claude_tools = env.as_claude_tools()
|
|
18
|
+
|
|
19
|
+
# Call tools with any format - auto-parses and returns matching format
|
|
20
|
+
result = await env.call_tool("web_navigate", url="https://google.com")
|
|
21
|
+
|
|
22
|
+
# Framework integrations (requires external deps)
|
|
23
|
+
agent_tools = env.as_openai_agent_tools() # needs openai-agents
|
|
24
|
+
lc_tools = env.as_langchain_tools() # needs langchain-core
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from hud.environment.connection import ConnectionConfig, ConnectionType, Connector
|
|
28
|
+
from hud.environment.environment import Environment
|
|
29
|
+
from hud.environment.mock import MockMixin, generate_mock_value
|
|
30
|
+
from hud.environment.router import ConflictResolution, ToolRouter
|
|
31
|
+
from hud.environment.scenarios import ScenarioMixin
|
|
32
|
+
from hud.environment.types import EnvConfig
|
|
33
|
+
from hud.environment.utils import ToolFormat, format_result, parse_tool_call, parse_tool_calls
|
|
34
|
+
|
|
35
|
+
__all__ = [
|
|
36
|
+
"ConflictResolution",
|
|
37
|
+
"ConnectionConfig",
|
|
38
|
+
"ConnectionType",
|
|
39
|
+
"Connector",
|
|
40
|
+
"EnvConfig",
|
|
41
|
+
"Environment",
|
|
42
|
+
"MockMixin",
|
|
43
|
+
"ScenarioMixin",
|
|
44
|
+
"ToolFormat",
|
|
45
|
+
"ToolRouter",
|
|
46
|
+
"format_result",
|
|
47
|
+
"generate_mock_value",
|
|
48
|
+
"parse_tool_call",
|
|
49
|
+
"parse_tool_calls",
|
|
50
|
+
]
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""Connection management for MCP servers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
8
|
+
|
|
9
|
+
import mcp.types as mcp_types
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Callable
|
|
13
|
+
|
|
14
|
+
from fastmcp.client import Client as FastMCPClient
|
|
15
|
+
from fastmcp.tools.tool import Tool
|
|
16
|
+
|
|
17
|
+
__all__ = ["ConnectionConfig", "ConnectionType", "Connector"]
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ConnectionType(str, Enum):
|
|
23
|
+
"""Type of connection - determines parallelization capability."""
|
|
24
|
+
|
|
25
|
+
LOCAL = "local" # Stdio/Docker - single instance, not parallelizable
|
|
26
|
+
REMOTE = "remote" # HTTP/URL - can spawn multiple instances
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ConnectionConfig:
|
|
30
|
+
"""Configuration for filtering/transforming tools from a remote connection."""
|
|
31
|
+
|
|
32
|
+
def __init__(
|
|
33
|
+
self,
|
|
34
|
+
*,
|
|
35
|
+
prefix: str | None = None,
|
|
36
|
+
include: list[str] | None = None,
|
|
37
|
+
exclude: list[str] | None = None,
|
|
38
|
+
transform: Callable[[Tool], Tool | None] | None = None,
|
|
39
|
+
) -> None:
|
|
40
|
+
self.prefix = prefix
|
|
41
|
+
self.include = include
|
|
42
|
+
self.exclude = exclude
|
|
43
|
+
self.transform = transform
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Connector:
|
|
47
|
+
"""Manages a connection to an MCP server with tool caching.
|
|
48
|
+
|
|
49
|
+
Client creation is deferred to connect() so that:
|
|
50
|
+
1. Each parallel trace gets fresh client instances
|
|
51
|
+
2. Connection happens inside trace context (for header injection)
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
def __init__(
|
|
55
|
+
self,
|
|
56
|
+
transport: Any,
|
|
57
|
+
config: ConnectionConfig,
|
|
58
|
+
name: str,
|
|
59
|
+
connection_type: ConnectionType,
|
|
60
|
+
*,
|
|
61
|
+
auth: str | None = None,
|
|
62
|
+
) -> None:
|
|
63
|
+
# Store transport config - client created in connect()
|
|
64
|
+
self._transport = transport
|
|
65
|
+
self._auth = auth
|
|
66
|
+
self.config = config
|
|
67
|
+
self.name = name
|
|
68
|
+
self.connection_type = connection_type
|
|
69
|
+
self.client: FastMCPClient[Any] | None = None
|
|
70
|
+
self._tools_cache: list[mcp_types.Tool] | None = None
|
|
71
|
+
|
|
72
|
+
def copy(self) -> Connector:
|
|
73
|
+
"""Create a copy of this connector with fresh (unconnected) state.
|
|
74
|
+
|
|
75
|
+
The copy shares transport config but has its own client instance,
|
|
76
|
+
allowing parallel execution without conflicts.
|
|
77
|
+
"""
|
|
78
|
+
return Connector(
|
|
79
|
+
transport=self._transport,
|
|
80
|
+
config=self.config,
|
|
81
|
+
name=self.name,
|
|
82
|
+
connection_type=self.connection_type,
|
|
83
|
+
auth=self._auth,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
@property
|
|
87
|
+
def is_local(self) -> bool:
|
|
88
|
+
"""True if this is a local (non-parallelizable) connection."""
|
|
89
|
+
return self.connection_type == ConnectionType.LOCAL
|
|
90
|
+
|
|
91
|
+
@property
|
|
92
|
+
def is_remote(self) -> bool:
|
|
93
|
+
"""True if this is a remote (parallelizable) connection."""
|
|
94
|
+
return self.connection_type == ConnectionType.REMOTE
|
|
95
|
+
|
|
96
|
+
@property
|
|
97
|
+
def is_connected(self) -> bool:
|
|
98
|
+
return self.client is not None and self.client.is_connected()
|
|
99
|
+
|
|
100
|
+
@property
|
|
101
|
+
def cached_tools(self) -> list[mcp_types.Tool]:
|
|
102
|
+
return self._tools_cache or []
|
|
103
|
+
|
|
104
|
+
async def connect(self) -> None:
|
|
105
|
+
"""Create FastMCP client and connect.
|
|
106
|
+
|
|
107
|
+
Client is created here (not in __init__) so that:
|
|
108
|
+
1. Each parallel trace gets fresh client instances
|
|
109
|
+
2. httpx auto-instrumentation can inject trace headers
|
|
110
|
+
"""
|
|
111
|
+
from fastmcp.client import Client as FastMCPClient
|
|
112
|
+
|
|
113
|
+
# Create fresh client from stored transport config
|
|
114
|
+
self.client = FastMCPClient(transport=self._transport, auth=self._auth)
|
|
115
|
+
await self.client.__aenter__()
|
|
116
|
+
|
|
117
|
+
async def disconnect(self) -> None:
|
|
118
|
+
"""Disconnect and clear cache."""
|
|
119
|
+
if self.client is not None and self.is_connected:
|
|
120
|
+
await self.client.__aexit__(None, None, None)
|
|
121
|
+
self.client = None
|
|
122
|
+
self._tools_cache = None
|
|
123
|
+
|
|
124
|
+
async def list_tools(self) -> list[mcp_types.Tool]:
    """Return the server's tools as exposed by this connector, and cache them.

    Every tool reported by the client goes through, in order:
        1. the include/exclude name filter from ``config``,
        2. the optional ``config.transform`` callable (a ``None`` result
           drops the tool),
        3. the optional ``config.prefix``, joined to the name with ``_``.

    The resulting list replaces ``_tools_cache`` and is returned.

    Raises:
        RuntimeError: If no client has been created yet.
    """
    client = self.client
    if client is None:
        raise RuntimeError("Not connected - call connect() first")

    remote_tools = await client.list_tools()
    cfg = self.config

    exposed: list[mcp_types.Tool] = []
    for remote_tool in remote_tools:
        # Filtering is done on the server-side (unprefixed) tool name.
        not_included = cfg.include is not None and remote_tool.name not in cfg.include
        if not_included or (cfg.exclude is not None and remote_tool.name in cfg.exclude):
            continue

        current = remote_tool
        if cfg.transform is not None:
            from fastmcp.tools.tool import Tool as FastMCPTool

            # The transform callable receives a FastMCP Tool built from the
            # MCP tool's name, description and input schema.
            candidate = FastMCPTool.model_construct(
                name=remote_tool.name,
                description=remote_tool.description or "",
                parameters=remote_tool.inputSchema,
            )
            rewritten = cfg.transform(candidate)
            if rewritten is None:
                continue
            # Convert the transformed tool back into an MCP tool.
            current = mcp_types.Tool(
                name=rewritten.name,
                description=rewritten.description,
                inputSchema=rewritten.parameters,
            )

        if cfg.prefix:
            exposed_name = f"{cfg.prefix}_{current.name}"
        else:
            exposed_name = current.name

        exposed.append(
            mcp_types.Tool(
                name=exposed_name,
                description=current.description,
                inputSchema=current.inputSchema,
            )
        )

    self._tools_cache = exposed
    return exposed
|
|
168
|
+
|
|
169
|
+
async def call_tool(
    self, name: str, arguments: dict[str, Any] | None = None
) -> mcp_types.CallToolResult:
    """Invoke a tool on the server, removing this connector's prefix first.

    Args:
        name: Tool name, with or without the ``<prefix>_`` this connector
            adds when listing tools.
        arguments: Tool arguments; ``None`` (or any falsy value) is sent as
            an empty dict.

    Raises:
        RuntimeError: If no client has been created yet.
    """
    client = self.client
    if client is None:
        raise RuntimeError("Not connected - call connect() first")

    prefix = self.config.prefix
    if prefix:
        # The server only knows the unprefixed name.
        name = name.removeprefix(f"{prefix}_")
    payload = arguments or {}
    return await client.call_tool_mcp(name, payload)
|
|
179
|
+
|
|
180
|
+
async def list_resources(self) -> list[mcp_types.Resource]:
    """Return the resources reported by the client.

    Raises:
        RuntimeError: If no client has been created yet.
    """
    client = self.client
    if client is None:
        raise RuntimeError("Not connected - call connect() first")
    resources = await client.list_resources()
    return resources
|
|
184
|
+
|
|
185
|
+
async def list_prompts(self) -> list[mcp_types.Prompt]:
    """Return the prompts reported by the client.

    Raises:
        RuntimeError: If no client has been created yet.
    """
    client = self.client
    if client is None:
        raise RuntimeError("Not connected - call connect() first")
    prompts = await client.list_prompts()
    return prompts
|
|
189
|
+
|
|
190
|
+
async def read_resource(
    self, uri: str
) -> list[mcp_types.TextResourceContents | mcp_types.BlobResourceContents]:
    """Read the resource at ``uri`` through the client and return its contents.

    Raises:
        RuntimeError: If no client has been created yet.
    """
    client = self.client
    if client is None:
        raise RuntimeError("Not connected - call connect() first")
    contents = await client.read_resource(uri)
    return contents
|
|
196
|
+
|
|
197
|
+
async def get_prompt(
    self, name: str, arguments: dict[str, Any] | None = None
) -> mcp_types.GetPromptResult:
    """Fetch the prompt called ``name`` through the client.

    ``arguments`` is forwarded to the client unchanged, including ``None``.

    Raises:
        RuntimeError: If no client has been created yet.
    """
    client = self.client
    if client is None:
        raise RuntimeError("Not connected - call connect() first")
    prompt = await client.get_prompt(name, arguments)
    return prompt
|
|
203
|
+
|
|
204
|
+
def __repr__(self) -> str:
    """Return a debug string with the name, connection type value and connected flag."""
    kind = self.connection_type.value
    return f"Connector({self.name!r}, {kind}, connected={self.is_connected})"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Connection connectors - methods for connecting to various sources."""
|
|
2
|
+
|
|
3
|
+
from hud.environment.connectors.local import LocalConnectorMixin
|
|
4
|
+
from hud.environment.connectors.openai import OpenAIConnectorMixin
|
|
5
|
+
from hud.environment.connectors.remote import RemoteConnectorMixin
|
|
6
|
+
|
|
7
|
+
__all__ = ["ConnectorsMixin"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Aggregates the three connector mixins into a single base class. The class
# body holds only a docstring, so every connect_* method is inherited; the
# order of the bases determines the method resolution order.
class ConnectorsMixin(
    RemoteConnectorMixin,
    LocalConnectorMixin,
    OpenAIConnectorMixin,
):
    """Combined connector mixin providing all connection methods.

    Remote connections:
        connect_hub(slug) - HUD Hub environment
        connect_url(url) - MCP server via URL
        connect_openapi(spec) - Mount OpenAPI spec as MCP server

    Local connections (in-process):
        connect_image(image) - Docker image via stdio
        connect_fastapi(app) - Mount FastAPI app as MCP server
        connect_server(server) - Mount MCPServer/FastMCP directly

    MCP config:
        connect_mcp(config) - Single mcp_config server (auto-detects local/remote)
        connect_mcp_config(mcp_config) - Multiple mcp_config servers

    Framework imports:
        connect_function_tools(tools) - Import OpenAI Agents SDK FunctionTools
    """
|