hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/datasets/utils.py CHANGED
@@ -1,118 +1,298 @@
1
- """Dataset utilities for loading, saving, and fetching datasets."""
1
+ """Utility functions and schemas for the datasets module."""
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- import json
6
5
  import logging
7
- from typing import Any
6
+ from typing import TYPE_CHECKING, Any
8
7
 
9
- from datasets import Dataset
8
+ import httpx
9
+ from pydantic import BaseModel, Field, field_validator, model_validator
10
10
 
11
- from hud.types import Task
11
+ from hud.settings import settings
12
+ from hud.types import AgentType, TaskInput
13
+ from hud.utils.hud_console import HUDConsole
12
14
 
13
- logger = logging.getLogger("hud.datasets")
15
+ if TYPE_CHECKING:
16
+ from collections.abc import Sequence
14
17
 
18
+ logger = logging.getLogger(__name__)
19
+ hud_console = HUDConsole()
15
20
 
16
- async def fetch_system_prompt_from_dataset(dataset_id: str) -> str | None:
17
- """
18
- Fetch system_prompt.txt from a HuggingFace dataset repository.
21
+ __all__ = [
22
+ "BatchRequest",
23
+ "SingleTaskRequest",
24
+ "cancel_all_jobs",
25
+ "cancel_job",
26
+ "cancel_task",
27
+ "submit_rollouts",
28
+ ]
19
29
 
20
- Args:
21
- dataset_id: HuggingFace dataset identifier (e.g., "hud-evals/SheetBench-50")
22
30
 
23
- Returns:
24
- System prompt text if found, None otherwise
31
class SingleTaskRequest(BaseModel):
    """Request to run a single task remotely - mirrors run_single_task() args."""

    # Field declaration order, defaults and description strings are part of the
    # serialized schema, so they are kept exactly as declared.
    task: dict[str, Any] = Field(
        description="Task definition (v4 LegacyTask or v5 Task format).",
    )
    agent_type: AgentType = Field(description="Agent type to execute the task.")
    agent_params: dict[str, Any] = Field(
        default_factory=dict,
        description="Agent constructor parameters passed to agent.create(). "
        "Should include fields from BaseCreateParams (auto_trace, auto_respond, verbose) "
        "plus agent-specific config fields (e.g., checkpoint_name for ClaudeConfig).",
    )
    max_steps: int = Field(default=10, description="Maximum steps allowed for the agent.")
    job_id: str = Field(description="HUD job identifier for telemetry association.")
    task_id: str | None = Field(default=None, description="Task identifier.")
    trace_name: str | None = Field(default=None, description="Trace name.")
    group_id: str | None = Field(default=None, description="Optional HUD group identifier.")
    metadata: dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata to inject into the trace context.",
    )
    trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.")
    use_byok: bool = Field(
        default=False,
        description="If True, use BYOK headers from encrypted env vars for inference.",
    )

    @field_validator("job_id")
    @classmethod
    def _validate_job_id(cls, value: str) -> str:
        """Reject a job ID that is empty or consists only of whitespace."""
        if value and value.strip():
            return value
        raise ValueError("job_id must be a non-empty string.")

    @model_validator(mode="after")
    def _validate_task(self) -> SingleTaskRequest:
        """Accept the payload only if it is a v4 LegacyTask or a v5 Task dict."""
        # Imported lazily, as in the rest of this module's use of hud.eval.utils.
        from hud.eval.utils import is_v4_format, validate_v4_task

        payload = self.task
        if is_v4_format(payload):
            # Looks like v4 (prompt + mcp_config): also enforce completeness,
            # which additionally requires evaluate_tool.
            validate_v4_task(payload)
        elif "env" not in payload:
            # Neither a v4 shape nor a v5 task carrying its 'env' entry.
            raise ValueError(
                "Task must have 'env' (v5) or 'prompt'+'mcp_config'+'evaluate_tool' (v4)"
            )
        return self
83
+
84
+
85
class BatchRequest(BaseModel):
    """Request to run multiple tasks remotely."""

    # One batch must carry at least 1 and at most 1000 single-task requests;
    # anything outside that range is rejected when the model is validated.
    requests: list[SingleTaskRequest] = Field(
        min_length=1,
        max_length=1000,
        description="List of single task requests to submit.",
    )
93
+
94
+
95
def _normalize_tasks(tasks: Sequence[TaskInput]) -> list[dict[str, Any]]:
    """Convert tasks to a list of plain dicts for remote API submission.

    Args:
        tasks: Task inputs. Each item may be a ``dict`` (used as-is, not copied),
            an object exposing ``model_dump`` (dumped with ``mode="json"``), or
            any other mapping (converted with ``dict(...)``).

    Returns:
        One dict per input item, in the same order as ``tasks``.

    Raises:
        TypeError: If an item is none of the supported kinds.
    """
    # Local import: the module only imports collections.abc names for type checking.
    from collections.abc import Mapping

    normalized: list[dict[str, Any]] = []
    for item in tasks:
        if isinstance(item, dict):
            # Plain dicts pass through untouched (same object, no copy).
            normalized.append(item)
        elif hasattr(item, "model_dump"):
            # Model-like objects: JSON mode so the payload is serializable.
            normalized.append(item.model_dump(mode="json"))
        elif isinstance(item, Mapping):
            # Read-only or custom mappings are materialized into a real dict.
            normalized.append(dict(item))
        else:
            raise TypeError(f"Cannot convert {type(item).__name__} to dict")
    return normalized
106
+
107
+
108
async def submit_rollouts(
    tasks: Sequence[TaskInput],
    job_id: str,
    agent_type: AgentType,
    agent_params: dict[str, Any] | None = None,
    max_steps: int = 10,
    group_size: int = 1,
    batch_size: int = 50,
    metadata: dict[str, Any] | None = None,
    use_byok: bool = False,
) -> None:
    """Submit rollouts to the HUD platform API for remote execution (fire-and-forget).

    Args:
        tasks: List of tasks (v5 Task, v4 LegacyTask, or dicts)
        job_id: HUD job ID for telemetry grouping
        agent_type: Agent type to use for execution
        agent_params: Parameters passed to agent.create()
        max_steps: Maximum steps per rollout
        group_size: Number of rollouts per task (for variance estimation); must be >= 1
        batch_size: Number of rollouts per API batch request; must be >= 1 and is
            capped at 1000, the largest batch BatchRequest accepts
        metadata: Additional metadata for each rollout
        use_byok: If True, use BYOK keys from encrypted env vars (remote only)

    Raises:
        ValueError: If no API key is configured, if ``group_size`` or ``batch_size``
            is below 1, if a v4 task uses a command-based (local) mcp_config, or if
            the API answers a batch with a 4xx status.
        TypeError: If a task cannot be converted to a dict.

    Task dicts that fail SingleTaskRequest validation propagate that model's
    validation error. 5xx responses and other per-batch failures are logged and
    counted as rejected; the remaining batches are still submitted.
    """
    from hud.eval.utils import is_v4_format

    if not settings.api_key:
        raise ValueError("HUD_API_KEY is required for remote execution")

    # Reject sizes that would otherwise silently submit nothing (negative values)
    # or fail with an unrelated range() error (batch_size == 0).
    if group_size < 1:
        raise ValueError("group_size must be at least 1")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    # BatchRequest.requests is declared with max_length=1000; a larger slice would
    # fail model validation part-way through a submission, so cap it here.
    max_batch_size = 1000
    batch_size = min(batch_size, max_batch_size)

    # Convert to dicts once for uniform processing
    task_dicts = _normalize_tasks(tasks)

    # Validate v4 tasks have remote-compatible mcp_config (URL-based, not command-based)
    for i, td in enumerate(task_dicts):
        if not is_v4_format(td):
            continue  # v5 tasks use env config, no mcp_config to check
        mcp_config = td.get("mcp_config") or {}
        for server_name, server_cfg in mcp_config.items():
            is_local = (
                isinstance(server_cfg, dict)
                and "command" in server_cfg
                and not server_cfg.get("url")
            )
            if is_local:
                raise ValueError(
                    f"Remote execution requires URL-based mcp_config. "
                    f"Task {td.get('id') or i} uses local Docker config for '{server_name}'. "
                    "Convert to remote with: hud convert <tasks_file>"
                )

    # Build single task requests
    requests: list[SingleTaskRequest] = []
    for task_idx, td in enumerate(task_dicts):
        base_task_id = td.get("id") or f"task_{task_idx}"
        trace_name = td.get("prompt") or td.get("scenario") or base_task_id

        for rollout_idx in range(group_size):
            # Rollouts of the same task get a suffix and share a group_id only
            # when there is more than one rollout per task.
            task_id = f"{base_task_id}_r{rollout_idx}" if group_size > 1 else base_task_id
            requests.append(
                SingleTaskRequest(
                    task=td,
                    agent_type=agent_type,
                    agent_params=agent_params or {},
                    max_steps=max_steps,
                    job_id=job_id,
                    task_id=task_id,
                    trace_name=trace_name,
                    group_id=base_task_id if group_size > 1 else None,
                    metadata=metadata or {},
                    use_byok=use_byok,
                )
            )

    # Submit in batches
    api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/run_list"
    headers = {"Authorization": f"Bearer {settings.api_key}"}

    total_accepted = 0
    total_rejected = 0
    # Ceiling division; constant for the whole submission.
    total_batches = (len(requests) + batch_size - 1) // batch_size

    async with httpx.AsyncClient(timeout=120) as client:
        for i in range(0, len(requests), batch_size):
            batch = requests[i : i + batch_size]
            batch_request = BatchRequest(requests=batch)

            try:
                response = await client.post(
                    api_url,
                    json=batch_request.model_dump(mode="json"),
                    headers=headers,
                )
                response.raise_for_status()
                result = response.json()

                total_accepted += result.get("accepted", 0)
                total_rejected += result.get("rejected", 0)

                for item in result.get("results", []):
                    if isinstance(item, dict) and item.get("status") == "rejected":
                        hud_console.warning(f"Task rejected: {item.get('error', 'Unknown reason')}")

                batch_num = (i // batch_size) + 1
                hud_console.info(
                    f"Batch {batch_num}/{total_batches}: "
                    f"{result.get('accepted', 0)}/{len(batch)} accepted"
                )

            except httpx.HTTPStatusError as exc:
                # 4xx aborts the whole submission; other statuses only fail this batch.
                if 400 <= exc.response.status_code < 500:
                    raise ValueError(f"Submission failed: {exc.response.text}") from exc
                hud_console.error(f"Batch submission failed: {exc.response.status_code}")
                total_rejected += len(batch)

            except Exception as exc:
                # Best effort: transport or decoding problems fail this batch only.
                hud_console.error(f"Batch submission failed: {exc}")
                total_rejected += len(batch)

    # Log final summary
    if total_rejected > 0:
        hud_console.warning(
            f"Submitted {total_accepted}/{len(requests)} requests ({total_rejected} rejected)"
        )
    else:
        hud_console.info(f"Submitted {total_accepted}/{len(requests)} requests")
61
233
 
62
234
 
63
- def save_tasks(
64
- tasks: list[dict[str, Any]], repo_id: str, fields: list[str] | None = None, **kwargs: Any
65
- ) -> None:
235
async def cancel_job(job_id: str) -> dict[str, Any]:
    """Cancel all tasks for a specific job.

    Args:
        job_id: The job ID to cancel

    Returns:
        Response with cancellation results including total_found, cancelled counts

    Raises:
        ValueError: If no HUD API key is configured.
        httpx.HTTPStatusError: If the API responds with an error status.
    """
    # Fail fast rather than sending a literal "Bearer None" header; this is the
    # same guard submit_rollouts() applies before talking to the API.
    if not settings.api_key:
        raise ValueError("HUD_API_KEY is required for remote execution")

    api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/cancel_job"
    headers = {"Authorization": f"Bearer {settings.api_key}"}

    async with httpx.AsyncClient(timeout=30) as client:
        response = await client.post(
            api_url,
            json={"job_id": job_id},
            headers=headers,
        )
        response.raise_for_status()
        return response.json()
68
255
 
69
- Complex fields (dicts, lists) are serialized as JSON strings to maintain clean schema
70
- and avoid null value pollution in HuggingFace datasets.
256
+
257
async def cancel_task(job_id: str, task_id: str) -> dict[str, Any]:
    """Cancel a specific task within a job.

    Args:
        job_id: The job ID
        task_id: The specific task ID to cancel

    Returns:
        Response with cancellation result

    Raises:
        ValueError: If no HUD API key is configured.
        httpx.HTTPStatusError: If the API responds with an error status.
    """
    # Fail fast rather than sending a literal "Bearer None" header; this is the
    # same guard submit_rollouts() applies before talking to the API.
    if not settings.api_key:
        raise ValueError("HUD_API_KEY is required for remote execution")

    api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/cancel"
    headers = {"Authorization": f"Bearer {settings.api_key}"}

    async with httpx.AsyncClient(timeout=30) as client:
        response = await client.post(
            api_url,
            json={"job_id": job_id, "task_id": task_id},
            headers=headers,
        )
        response.raise_for_status()
        return response.json()
96
278
 
97
- row = {}
98
279
 
99
- # Determine which fields to process
100
- fields_to_process = fields if fields is not None else list(tc_dict.keys())
280
async def cancel_all_jobs() -> dict[str, Any]:
    """Cancel ALL active jobs for the authenticated user.

    This is a "panic button" to stop all running rollouts.

    Returns:
        Response with jobs_cancelled, total_tasks_cancelled, and job_details

    Raises:
        ValueError: If no HUD API key is configured.
        httpx.HTTPStatusError: If the API responds with an error status.
    """
    # Fail fast rather than sending a literal "Bearer None" header; this is the
    # same guard submit_rollouts() applies before talking to the API.
    if not settings.api_key:
        raise ValueError("HUD_API_KEY is required for remote execution")

    api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/cancel_user_jobs"
    headers = {"Authorization": f"Bearer {settings.api_key}"}

    # The endpoint takes no parameters; an empty JSON object is sent as the body.
    async with httpx.AsyncClient(timeout=60) as client:
        response = await client.post(
            api_url,
            json={},
            headers=headers,
        )
        response.raise_for_status()
        return response.json()
@@ -0,0 +1,52 @@
1
+ """
2
+ HUD Environment - A unified abstraction for MCP environments.
3
+
4
+ The Environment class is a server that you can also use as a client.
5
+ It subclasses MCPServer to get server capabilities (@env.tool, serve())
6
+ and composes FastMCP Client instances for remote connections.
7
+
8
+ Usage:
9
+ from hud.environment import Environment
10
+
11
+ # Create and connect
12
+ env = Environment("my-env").connect_hub("browser", prefix="web")
13
+
14
+ async with env:
15
+ # Get tools in any format
16
+ openai_tools = env.as_openai_chat_tools()
17
+ claude_tools = env.as_claude_tools()
18
+
19
+ # Call tools with any format - auto-parses and returns matching format
20
+ result = await env.call_tool("web_navigate", url="https://google.com")
21
+
22
+ # Framework integrations (requires external deps)
23
+ agent_tools = env.as_openai_agent_tools() # needs openai-agents
24
+ lc_tools = env.as_langchain_tools() # needs langchain-core
25
+ """
26
+
27
+ from hud.environment.connection import ConnectionConfig, ConnectionType, Connector
28
+ from hud.environment.environment import Environment
29
+ from hud.environment.mock import MockMixin, generate_mock_value
30
+ from hud.environment.router import ConflictResolution, MCPRouter, ToolRouter
31
+ from hud.environment.scenarios import ScenarioMixin, ScenarioSession
32
+ from hud.environment.types import EnvConfig
33
+ from hud.environment.utils import ToolFormat, format_result, parse_tool_call, parse_tool_calls
34
+
35
+ __all__ = [
36
+ "ConflictResolution",
37
+ "ConnectionConfig",
38
+ "ConnectionType",
39
+ "Connector",
40
+ "EnvConfig",
41
+ "Environment",
42
+ "MCPRouter",
43
+ "MockMixin",
44
+ "ScenarioMixin",
45
+ "ScenarioSession",
46
+ "ToolFormat",
47
+ "ToolRouter", # Backwards compat alias for MCPRouter
48
+ "format_result",
49
+ "generate_mock_value",
50
+ "parse_tool_call",
51
+ "parse_tool_calls",
52
+ ]