hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/datasets/utils.py CHANGED
@@ -1,118 +1,298 @@
1
- """Dataset utilities for loading, saving, and fetching datasets."""
1
+ """Utility functions and schemas for the datasets module."""
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- import json
6
5
  import logging
7
- from typing import Any
6
+ from typing import TYPE_CHECKING, Any
8
7
 
9
- from datasets import Dataset
8
+ import httpx
9
+ from pydantic import BaseModel, Field, field_validator, model_validator
10
10
 
11
- from hud.types import Task
11
+ from hud.settings import settings
12
+ from hud.types import AgentType, TaskInput
13
+ from hud.utils.hud_console import HUDConsole
12
14
 
13
- logger = logging.getLogger("hud.datasets")
15
+ if TYPE_CHECKING:
16
+ from collections.abc import Sequence
14
17
 
18
+ logger = logging.getLogger(__name__)
19
+ hud_console = HUDConsole()
15
20
 
16
- async def fetch_system_prompt_from_dataset(dataset_id: str) -> str | None:
17
- """
18
- Fetch system_prompt.txt from a HuggingFace dataset repository.
21
+ __all__ = [
22
+ "BatchRequest",
23
+ "SingleTaskRequest",
24
+ "cancel_all_jobs",
25
+ "cancel_job",
26
+ "cancel_task",
27
+ "submit_rollouts",
28
+ ]
19
29
 
20
- Args:
21
- dataset_id: HuggingFace dataset identifier (e.g., "hud-evals/SheetBench-50")
22
30
 
23
- Returns:
24
- System prompt text if found, None otherwise
31
class SingleTaskRequest(BaseModel):
    """Payload describing one remote task run (mirrors run_single_task() args).

    The ``task`` dict is accepted in either of two shapes, checked after the
    individual fields have been validated:

    * v4 (legacy): recognised by ``is_v4_format`` and then checked for
      completeness by ``validate_v4_task``.
    * v5: any dict that carries an ``"env"`` key.

    Anything else is rejected with a ``ValueError``.
    """

    task: dict[str, Any] = Field(
        description="Task definition (v4 LegacyTask or v5 Task format).",
    )
    agent_type: AgentType = Field(description="Agent type to execute the task.")
    agent_params: dict[str, Any] = Field(
        default_factory=dict,
        description="Agent constructor parameters passed to agent.create(). "
        "Should include fields from BaseCreateParams (auto_trace, auto_respond, verbose) "
        "plus agent-specific config fields (e.g., checkpoint_name for ClaudeConfig).",
    )
    max_steps: int = Field(default=10, description="Maximum steps allowed for the agent.")
    job_id: str = Field(description="HUD job identifier for telemetry association.")
    task_id: str | None = Field(default=None, description="Task identifier.")
    trace_name: str | None = Field(default=None, description="Trace name.")
    group_id: str | None = Field(default=None, description="Optional HUD group identifier.")
    metadata: dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata to inject into the trace context.",
    )
    trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.")
    use_byok: bool = Field(
        default=False,
        description="If True, use BYOK headers from encrypted env vars for inference.",
    )

    @field_validator("job_id")
    @classmethod
    def _validate_job_id(cls, value: str) -> str:
        """Reject empty or whitespace-only job IDs; the value is returned unchanged."""
        if value and value.strip():
            return value
        raise ValueError("job_id must be a non-empty string.")

    @model_validator(mode="after")
    def _validate_task(self) -> SingleTaskRequest:
        """Ensure ``task`` is a complete v4 task or a v5 task with an ``env`` key."""
        # Imported lazily, as in the rest of this module's validators.
        from hud.eval.utils import is_v4_format, validate_v4_task

        payload = self.task

        if not is_v4_format(payload):
            # Not v4-shaped: the only other accepted shape is v5 (needs "env").
            if "env" not in payload:
                raise ValueError(
                    "Task must have 'env' (v5) or 'prompt'+'mcp_config'+'evaluate_tool' (v4)"
                )
            return self

        # Looks like v4 (prompt + mcp_config): verify it is complete
        # (validate_v4_task also requires evaluate_tool).
        validate_v4_task(payload)
        return self
83
+
84
+
85
class BatchRequest(BaseModel):
    """Envelope for submitting several task requests in one API call.

    The ``requests`` list must hold between 1 and 1000 entries.
    """

    requests: list[SingleTaskRequest] = Field(
        min_length=1,
        max_length=1000,
        description="List of single task requests to submit.",
    )
93
+
94
+
95
+ def _normalize_tasks(tasks: Sequence[TaskInput]) -> list[dict[str, Any]]:
96
+ """Convert tasks to list of dicts for remote API submission."""
97
+ result = []
98
+ for t in tasks:
99
+ if isinstance(t, dict):
100
+ result.append(t)
101
+ elif hasattr(t, "model_dump"):
102
+ result.append(t.model_dump(mode="json"))
103
+ else:
104
+ raise TypeError(f"Cannot convert {type(t).__name__} to dict")
105
+ return result
106
+
107
+
108
async def submit_rollouts(
    tasks: Sequence[TaskInput],
    job_id: str,
    agent_type: AgentType,
    agent_params: dict[str, Any] | None = None,
    max_steps: int = 10,
    group_size: int = 1,
    batch_size: int = 50,
    metadata: dict[str, Any] | None = None,
    use_byok: bool = False,
) -> None:
    """Submit rollouts to the HUD platform API for remote execution (fire-and-forget).

    Each task is expanded into ``group_size`` requests, which are then posted
    to ``/v1/rollouts/run_list`` in chunks of ``batch_size``. Nothing is
    returned; accepted/rejected counts are reported through ``hud_console``.

    Args:
        tasks: List of tasks (v5 Task, v4 LegacyTask, or dicts)
        job_id: HUD job ID for telemetry grouping
        agent_type: Agent type to use for execution
        agent_params: Parameters passed to agent.create()
        max_steps: Maximum steps per rollout
        group_size: Number of rollouts per task (for variance estimation); must be >= 1
        batch_size: Number of rollouts per API batch request; must be >= 1
        metadata: Additional metadata for each rollout
        use_byok: If True, use BYOK keys from encrypted env vars (remote only)

    Raises:
        ValueError: If ``batch_size`` or ``group_size`` is below 1, if no API key
            is configured, if a v4 task uses a command-based (local) mcp_config,
            or if the API answers a batch with a 4xx status.
        TypeError: If a task cannot be converted to a dict.

    Note:
        5xx responses and other per-batch failures are not raised: the batch is
        logged as failed, counted as rejected, and submission continues.
    """
    # Validate sizing before doing any work. Previously batch_size=0 failed late
    # with an opaque range() error, and negative values silently submitted nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if group_size < 1:
        raise ValueError(f"group_size must be >= 1, got {group_size}")

    from hud.eval.utils import is_v4_format

    if not settings.api_key:
        raise ValueError("HUD_API_KEY is required for remote execution")

    # Convert to dicts once for uniform processing
    task_dicts = _normalize_tasks(tasks)

    # Validate v4 tasks have remote-compatible mcp_config (URL-based, not command-based)
    for i, td in enumerate(task_dicts):
        if not is_v4_format(td):
            continue  # v5 tasks use env config, no mcp_config to check
        mcp_config = td.get("mcp_config") or {}
        for server_name, server_cfg in mcp_config.items():
            is_local = (
                isinstance(server_cfg, dict)
                and "command" in server_cfg
                and not server_cfg.get("url")
            )
            if is_local:
                raise ValueError(
                    f"Remote execution requires URL-based mcp_config. "
                    f"Task {td.get('id') or i} uses local Docker config for '{server_name}'. "
                    "Convert to remote with: hud convert <tasks_file>"
                )

    # Build single task requests (group_size rollouts per task)
    requests: list[SingleTaskRequest] = []
    for task_idx, td in enumerate(task_dicts):
        base_task_id = td.get("id") or f"task_{task_idx}"
        trace_name = td.get("prompt") or td.get("scenario") or base_task_id

        for rollout_idx in range(group_size):
            # Only suffix the ID (and set a group) when there is more than one rollout
            task_id = f"{base_task_id}_r{rollout_idx}" if group_size > 1 else base_task_id
            requests.append(
                SingleTaskRequest(
                    task=td,
                    agent_type=agent_type,
                    agent_params=agent_params or {},
                    max_steps=max_steps,
                    job_id=job_id,
                    task_id=task_id,
                    trace_name=trace_name,
                    group_id=base_task_id if group_size > 1 else None,
                    metadata=metadata or {},
                    use_byok=use_byok,
                )
            )

    # Submit in batches
    api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/run_list"
    headers = {"Authorization": f"Bearer {settings.api_key}"}

    total_accepted = 0
    total_rejected = 0
    # Ceiling division; loop-invariant, so computed once.
    total_batches = (len(requests) + batch_size - 1) // batch_size

    async with httpx.AsyncClient(timeout=120) as client:
        for batch_num, start in enumerate(range(0, len(requests), batch_size), start=1):
            batch = requests[start : start + batch_size]
            batch_request = BatchRequest(requests=batch)

            try:
                response = await client.post(
                    api_url,
                    json=batch_request.model_dump(mode="json"),
                    headers=headers,
                )
                response.raise_for_status()
                result = response.json()

                total_accepted += result.get("accepted", 0)
                total_rejected += result.get("rejected", 0)

                for item in result.get("results", []):
                    if isinstance(item, dict) and item.get("status") == "rejected":
                        hud_console.warning(f"Task rejected: {item.get('error', 'Unknown reason')}")

                hud_console.info(
                    f"Batch {batch_num}/{total_batches}: "
                    f"{result.get('accepted', 0)}/{len(batch)} accepted"
                )

            except httpx.HTTPStatusError as exc:
                # Client errors (4xx) will not succeed on later batches either: abort.
                if 400 <= exc.response.status_code < 500:
                    raise ValueError(f"Submission failed: {exc.response.text}") from exc
                hud_console.error(f"Batch submission failed: {exc.response.status_code}")
                total_rejected += len(batch)

            except Exception as exc:
                # Best effort: record the failed batch and keep submitting the rest.
                hud_console.error(f"Batch submission failed: {exc}")
                total_rejected += len(batch)

    # Log final summary
    if total_rejected > 0:
        hud_console.warning(
            f"Submitted {total_accepted}/{len(requests)} requests ({total_rejected} rejected)"
        )
    else:
        hud_console.info(f"Submitted {total_accepted}/{len(requests)} requests")
61
233
 
62
234
 
63
- def save_tasks(
64
- tasks: list[dict[str, Any]], repo_id: str, fields: list[str] | None = None, **kwargs: Any
65
- ) -> None:
235
async def cancel_job(job_id: str) -> dict[str, Any]:
    """Cancel all tasks for a specific job.

    Posts ``{"job_id": job_id}`` to ``/v1/rollouts/cancel_job``.

    Args:
        job_id: The job ID to cancel

    Returns:
        Response with cancellation results including total_found, cancelled counts

    Raises:
        ValueError: If no HUD API key is configured.
        httpx.HTTPStatusError: If the API responds with an error status.
    """
    # Fail fast instead of sending "Bearer None" (same guard as submit_rollouts).
    if not settings.api_key:
        raise ValueError("HUD_API_KEY is required to cancel remote jobs")

    api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/cancel_job"
    headers = {"Authorization": f"Bearer {settings.api_key}"}

    async with httpx.AsyncClient(timeout=30) as client:
        response = await client.post(
            api_url,
            json={"job_id": job_id},
            headers=headers,
        )
        response.raise_for_status()
        return response.json()
68
255
 
69
- Complex fields (dicts, lists) are serialized as JSON strings to maintain clean schema
70
- and avoid null value pollution in HuggingFace datasets.
256
+
257
async def cancel_task(job_id: str, task_id: str) -> dict[str, Any]:
    """Cancel a specific task within a job.

    Posts ``{"job_id": job_id, "task_id": task_id}`` to ``/v1/rollouts/cancel``.

    Args:
        job_id: The job ID
        task_id: The specific task ID to cancel

    Returns:
        Response with cancellation result

    Raises:
        ValueError: If no HUD API key is configured.
        httpx.HTTPStatusError: If the API responds with an error status.
    """
    # Fail fast instead of sending "Bearer None" (same guard as submit_rollouts).
    if not settings.api_key:
        raise ValueError("HUD_API_KEY is required to cancel remote tasks")

    api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/cancel"
    headers = {"Authorization": f"Bearer {settings.api_key}"}

    async with httpx.AsyncClient(timeout=30) as client:
        response = await client.post(
            api_url,
            json={"job_id": job_id, "task_id": task_id},
            headers=headers,
        )
        response.raise_for_status()
        return response.json()
96
278
 
97
- row = {}
98
279
 
99
- # Determine which fields to process
100
- fields_to_process = fields if fields is not None else list(tc_dict.keys())
280
async def cancel_all_jobs() -> dict[str, Any]:
    """Cancel ALL active jobs for the authenticated user.

    This is a "panic button" to stop all running rollouts. It posts an empty
    JSON body to ``/v1/rollouts/cancel_user_jobs``.

    Returns:
        Response with jobs_cancelled, total_tasks_cancelled, and job_details

    Raises:
        ValueError: If no HUD API key is configured.
        httpx.HTTPStatusError: If the API responds with an error status.
    """
    # Fail fast instead of sending "Bearer None" (same guard as submit_rollouts).
    if not settings.api_key:
        raise ValueError("HUD_API_KEY is required to cancel remote jobs")

    api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/cancel_user_jobs"
    headers = {"Authorization": f"Bearer {settings.api_key}"}

    async with httpx.AsyncClient(timeout=60) as client:
        response = await client.post(
            api_url,
            json={},
            headers=headers,
        )
        response.raise_for_status()
        return response.json()
+ return response.json()
hud/environment/__init__.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ HUD Environment - A unified abstraction for MCP environments.
3
+
4
+ The Environment class is a server that you can also use as a client.
5
+ It subclasses MCPServer to get server capabilities (@env.tool, serve())
6
+ and composes FastMCP Client instances for remote connections.
7
+
8
+ Usage:
9
+ from hud.environment import Environment
10
+
11
+ # Create and connect
12
+ env = Environment("my-env").connect_hub("browser", prefix="web")
13
+
14
+ async with env:
15
+ # Get tools in any format
16
+ openai_tools = env.as_openai_chat_tools()
17
+ claude_tools = env.as_claude_tools()
18
+
19
+ # Call tools with any format - auto-parses and returns matching format
20
+ result = await env.call_tool("web_navigate", url="https://google.com")
21
+
22
+ # Framework integrations (requires external deps)
23
+ agent_tools = env.as_openai_agent_tools() # needs openai-agents
24
+ lc_tools = env.as_langchain_tools() # needs langchain-core
25
+ """
26
+
27
+ from hud.environment.connection import ConnectionConfig, ConnectionType, Connector
28
+ from hud.environment.environment import Environment
29
+ from hud.environment.mock import MockMixin, generate_mock_value
30
+ from hud.environment.router import ConflictResolution, ToolRouter
31
+ from hud.environment.scenarios import ScenarioMixin
32
+ from hud.environment.types import EnvConfig
33
+ from hud.environment.utils import ToolFormat, format_result, parse_tool_call, parse_tool_calls
34
+
35
+ __all__ = [
36
+ "ConflictResolution",
37
+ "ConnectionConfig",
38
+ "ConnectionType",
39
+ "Connector",
40
+ "EnvConfig",
41
+ "Environment",
42
+ "MockMixin",
43
+ "ScenarioMixin",
44
+ "ToolFormat",
45
+ "ToolRouter",
46
+ "format_result",
47
+ "generate_mock_value",
48
+ "parse_tool_call",
49
+ "parse_tool_calls",
50
+ ]
hud/environment/connection.py ADDED
@@ -0,0 +1,206 @@
1
+ """Connection management for MCP servers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from enum import Enum
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ import mcp.types as mcp_types
10
+
11
+ if TYPE_CHECKING:
12
+ from collections.abc import Callable
13
+
14
+ from fastmcp.client import Client as FastMCPClient
15
+ from fastmcp.tools.tool import Tool
16
+
17
+ __all__ = ["ConnectionConfig", "ConnectionType", "Connector"]
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
class ConnectionType(str, Enum):
    """Kind of connection, which decides whether it can be parallelized.

    Members are also plain strings (``"local"`` / ``"remote"``).
    """

    # Stdio/Docker: a single instance, not parallelizable.
    LOCAL = "local"
    # HTTP/URL: multiple instances can be spawned.
    REMOTE = "remote"
27
+
28
+
29
class ConnectionConfig:
    """Options for filtering and reshaping the tools exposed by a connection.

    All options are keyword-only and default to ``None`` (no effect):

    * ``prefix``: string prepended (with ``_``) to tool names.
    * ``include``: allow-list of tool names.
    * ``exclude``: deny-list of tool names.
    * ``transform``: callable mapping a tool to a new tool, or to ``None`` to drop it.
    """

    def __init__(
        self,
        *,
        prefix: str | None = None,
        include: list[str] | None = None,
        exclude: list[str] | None = None,
        transform: Callable[[Tool], Tool | None] | None = None,
    ) -> None:
        # Name-based filtering.
        self.include, self.exclude = include, exclude
        # Renaming / rewriting of the tools that survive the filters.
        self.prefix, self.transform = prefix, transform
44
+
45
+
46
class Connector:
    """Manages a connection to an MCP server with tool caching.

    Client creation is deferred to connect() so that:
    1. Each parallel trace gets fresh client instances
    2. Connection happens inside trace context (for header injection)

    Every method that talks to the server raises ``RuntimeError`` when called
    before ``connect()``.
    """

    def __init__(
        self,
        transport: Any,
        config: ConnectionConfig,
        name: str,
        connection_type: ConnectionType,
        *,
        auth: str | None = None,
    ) -> None:
        # Store transport config - client created in connect()
        self._transport = transport
        self._auth = auth
        self.config = config
        self.name = name
        self.connection_type = connection_type
        self.client: FastMCPClient[Any] | None = None
        # Result of the most recent list_tools() call; None until then.
        self._tools_cache: list[mcp_types.Tool] | None = None

    def copy(self) -> Connector:
        """Create a copy of this connector with fresh (unconnected) state.

        The copy shares transport config but has its own client instance,
        allowing parallel execution without conflicts.
        """
        return Connector(
            transport=self._transport,
            config=self.config,
            name=self.name,
            connection_type=self.connection_type,
            auth=self._auth,
        )

    @property
    def is_local(self) -> bool:
        """True if this is a local (non-parallelizable) connection."""
        return self.connection_type == ConnectionType.LOCAL

    @property
    def is_remote(self) -> bool:
        """True if this is a remote (parallelizable) connection."""
        return self.connection_type == ConnectionType.REMOTE

    @property
    def is_connected(self) -> bool:
        """True if a client exists and reports itself as connected."""
        return self.client is not None and self.client.is_connected()

    @property
    def cached_tools(self) -> list[mcp_types.Tool]:
        """Tools from the last list_tools() call, or an empty list if none."""
        return self._tools_cache or []

    def _require_client(self) -> FastMCPClient[Any]:
        """Return the active client, or raise if connect() has not been called."""
        if self.client is None:
            raise RuntimeError("Not connected - call connect() first")
        return self.client

    async def connect(self) -> None:
        """Create FastMCP client and connect.

        Client is created here (not in __init__) so that:
        1. Each parallel trace gets fresh client instances
        2. httpx auto-instrumentation can inject trace headers

        If entering the client fails, ``self.client`` is reset to ``None``
        before the error propagates, so later calls report "Not connected"
        instead of using a client that never connected.
        """
        from fastmcp.client import Client as FastMCPClient

        # Create fresh client from stored transport config
        self.client = FastMCPClient(transport=self._transport, auth=self._auth)
        try:
            await self.client.__aenter__()
        except BaseException:
            self.client = None
            raise

    async def disconnect(self) -> None:
        """Disconnect and clear cache.

        The client reference and tool cache are cleared even if closing the
        client raises; the error still propagates to the caller.
        """
        client = self.client
        try:
            if client is not None and client.is_connected():
                await client.__aexit__(None, None, None)
        finally:
            self.client = None
            self._tools_cache = None

    async def list_tools(self) -> list[mcp_types.Tool]:
        """Fetch tools from server, apply filters/transforms/prefix, and cache."""
        tools = await self._require_client().list_tools()

        result: list[mcp_types.Tool] = []
        for tool in tools:
            # Apply include/exclude filter (matched against the server-side name)
            if self.config.include is not None and tool.name not in self.config.include:
                continue
            if self.config.exclude is not None and tool.name in self.config.exclude:
                continue

            # Apply transform
            if self.config.transform is not None:
                from fastmcp.tools.tool import Tool as FastMCPTool

                # The transform callback works on FastMCP tools, so wrap the
                # MCP tool first (model_construct skips validation).
                fastmcp_tool = FastMCPTool.model_construct(
                    name=tool.name,
                    description=tool.description or "",
                    parameters=tool.inputSchema,
                )
                transformed = self.config.transform(fastmcp_tool)
                if transformed is None:
                    continue  # transform asked to drop this tool
                tool = mcp_types.Tool(
                    name=transformed.name,
                    description=transformed.description,
                    inputSchema=transformed.parameters,
                )

            # Apply prefix
            name = f"{self.config.prefix}_{tool.name}" if self.config.prefix else tool.name
            result.append(
                mcp_types.Tool(
                    name=name,
                    description=tool.description,
                    inputSchema=tool.inputSchema,
                )
            )

        self._tools_cache = result
        return result

    async def call_tool(
        self, name: str, arguments: dict[str, Any] | None = None
    ) -> mcp_types.CallToolResult:
        """Call a tool, stripping prefix if needed."""
        client = self._require_client()
        # Strip prefix when calling remote
        if self.config.prefix and name.startswith(f"{self.config.prefix}_"):
            name = name[len(self.config.prefix) + 1 :]
        return await client.call_tool_mcp(name, arguments or {})

    async def list_resources(self) -> list[mcp_types.Resource]:
        """Return the resources reported by the connected server."""
        return await self._require_client().list_resources()

    async def list_prompts(self) -> list[mcp_types.Prompt]:
        """Return the prompts reported by the connected server."""
        return await self._require_client().list_prompts()

    async def read_resource(
        self, uri: str
    ) -> list[mcp_types.TextResourceContents | mcp_types.BlobResourceContents]:
        """Read the resource at ``uri`` from the connected server."""
        return await self._require_client().read_resource(uri)

    async def get_prompt(
        self, name: str, arguments: dict[str, Any] | None = None
    ) -> mcp_types.GetPromptResult:
        """Fetch the prompt ``name`` from the connected server."""
        return await self._require_client().get_prompt(name, arguments)

    def __repr__(self) -> str:
        t = self.connection_type.value
        return f"Connector({self.name!r}, {t}, connected={self.is_connected})"
hud/environment/connectors/__init__.py ADDED
@@ -0,0 +1,33 @@
1
+ """Connection connectors - methods for connecting to various sources."""
2
+
3
+ from hud.environment.connectors.local import LocalConnectorMixin
4
+ from hud.environment.connectors.openai import OpenAIConnectorMixin
5
+ from hud.environment.connectors.remote import RemoteConnectorMixin
6
+
7
+ __all__ = ["ConnectorsMixin"]
8
+
9
+
10
class ConnectorsMixin(
    RemoteConnectorMixin,
    LocalConnectorMixin,
    OpenAIConnectorMixin,
):
    """Aggregate mixin exposing every ``connect_*`` method in one place.

    The class adds no behavior of its own; everything is inherited from the
    remote, local and OpenAI connector mixins.

    Remote connections:
        connect_hub(slug) - HUD Hub environment
        connect_url(url) - MCP server via URL
        connect_openapi(spec) - Mount OpenAPI spec as MCP server

    Local connections (in-process):
        connect_image(image) - Docker image via stdio
        connect_fastapi(app) - Mount FastAPI app as MCP server
        connect_server(server) - Mount MCPServer/FastMCP directly

    MCP config:
        connect_mcp(config) - Single mcp_config server (auto-detects local/remote)
        connect_mcp_config(mcp_config) - Multiple mcp_config servers

    Framework imports:
        connect_function_tools(tools) - Import OpenAI Agents SDK FunctionTools
    """