hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/datasets/loader.py ADDED
@@ -0,0 +1,326 @@
+"""Task loading utilities for HUD.
+
+Unified interface for loading evaluation tasks from:
+- HUD API (v5 format)
+- Local JSON/JSONL files (v4 LegacyTask format, auto-converted)
+- HuggingFace datasets (v4 LegacyTask format, auto-converted)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import warnings
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, overload
+
+import httpx
+
+from hud.settings import settings
+
+if TYPE_CHECKING:
+    from hud.eval.task import Task
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["load_dataset", "load_tasks", "save_tasks"]
+
+
+def _load_raw_from_file(path: Path) -> list[dict[str, Any]]:
+    """Load raw task dicts from a local JSON or JSONL file."""
+    raw_items: list[dict[str, Any]] = []
+
+    if path.suffix == ".jsonl":
+        # JSONL: one task per line
+        with open(path, encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                item = json.loads(line)
+                # Handle case where line contains a list
+                if isinstance(item, list):
+                    raw_items.extend(i for i in item if isinstance(i, dict))
+                elif isinstance(item, dict):
+                    raw_items.append(item)
+                else:
+                    raise ValueError(
+                        f"Invalid JSONL format: expected dict or list, got {type(item)}"
+                    )
+    else:
+        # JSON: array of tasks
+        with open(path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        if isinstance(data, list):
+            raw_items = [item for item in data if isinstance(item, dict)]
+        elif isinstance(data, dict):
+            raw_items = [data]
+        else:
+            raise ValueError(f"JSON file must contain an array or object, got {type(data)}")
+
+    return raw_items
+
+
+def _load_from_file(path: Path) -> list[Task]:
+    """Load tasks from a local JSON or JSONL file."""
+    from hud.eval.task import Task
+
+    raw_items = _load_raw_from_file(path)
+    # Default args to {} for runnable tasks (None = template)
+    return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
+
+
+def _load_raw_from_huggingface(dataset_name: str) -> list[dict[str, Any]]:
+    """Load raw task dicts from HuggingFace dataset."""
+    try:
+        from datasets import load_dataset as hf_load_dataset
+    except ImportError as e:
+        raise ImportError(
+            "Please install 'datasets' to load from HuggingFace: uv pip install datasets"
+        ) from e
+
+    # Parse dataset name and optional split
+    if ":" in dataset_name:
+        name, split = dataset_name.split(":", 1)
+    else:
+        name = dataset_name
+        split = "train"  # Default split
+
+    logger.info("Loading from HuggingFace dataset: %s (split=%s)", name, split)
+    dataset = hf_load_dataset(name, split=split)
+
+    raw_items: list[dict[str, Any]] = []
+    for item in dataset:
+        if not isinstance(item, dict):
+            raise ValueError(f"Invalid HuggingFace dataset: expected dict, got {type(item)}")
+        raw_items.append(dict(item))
+
+    return raw_items
+
+
+def _load_from_huggingface(dataset_name: str) -> list[Task]:
+    """Load tasks from HuggingFace dataset."""
+    raw_items = _load_raw_from_huggingface(dataset_name)
+    from hud.eval.task import Task
+
+    # Default args to {} for runnable tasks (None = template)
+    return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
+
+
+def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
+    """Load raw task dicts from HUD API."""
+    headers = {}
+    if settings.api_key:
+        headers["Authorization"] = f"Bearer {settings.api_key}"
+
+    with httpx.Client() as client:
+        response = client.get(
+            f"{settings.hud_api_url}/tasks/evalset/{dataset_name}",
+            headers=headers,
+            params={"all": "true"},
+        )
+        response.raise_for_status()
+        data = response.json()
+
+    # Extract tasks dict from response
+    tasks_dict = data.get("tasks", {})
+
+    raw_items: list[dict[str, Any]] = []
+    for task_id, task_data in tasks_dict.items():
+        if task_data.get("id") is None:
+            task_data["id"] = task_id
+        raw_items.append(task_data)
+
+    return raw_items
+
+
+def _load_from_api(dataset_name: str) -> list[Task]:
+    """Load tasks from HUD API."""
+    from hud.eval.task import Task
+
+    raw_items = _load_raw_from_api(dataset_name)
+    # Default args to {} for runnable tasks (None = template)
+    return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
+
+
+@overload
+def load_tasks(source: str, *, raw: bool = False) -> list[Task]: ...
+
+
+@overload
+def load_tasks(source: str, *, raw: bool = True) -> list[dict[str, Any]]: ...
+
+
+def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, Any]]:
+    """Load tasks from a source.
+
+    Supports multiple sources with auto-detection:
+    - Local file path (JSON or JSONL)
+    - HUD API dataset slug (e.g., "hud-evals/SheetBench-50")
+    - HuggingFace dataset (e.g., "username/dataset" or "username/dataset:split")
+
+    Automatically detects and converts v4 LegacyTask format to v5 Task.
+
+    Args:
+        source: Task source. Can be:
+            - Path to a local JSON/JSONL file
+            - HUD API dataset slug (e.g., "hud-evals/SheetBench-50")
+            - HuggingFace dataset name (e.g., "hud-evals/tasks" or "hud-evals/tasks:train")
+        raw: If True, return raw dicts without validation or env var substitution.
+            Useful for preserving template strings like "${HUD_API_KEY}".
+
+    Returns:
+        - If raw=False (default): list[Task] ready to use with hud.eval()
+        - If raw=True: list[dict] with raw task data
+
+    Example:
+        ```python
+        import hud
+        from hud.datasets import load_tasks
+
+        # Load from HUD API
+        tasks = load_tasks("hud-evals/SheetBench-50")
+
+        # Load from local file (v4 format auto-converted)
+        tasks = load_tasks("./my-tasks.json")
+
+        # Load from HuggingFace
+        tasks = load_tasks("hud-evals/benchmark:test")

+        # Load raw dicts (preserves env var placeholders)
+        raw_tasks = load_tasks("./tasks.json", raw=True)
+
+        # Run evaluation
+        async with hud.eval(tasks) as ctx:
+            await agent.run(ctx)
+        ```
+
+    Raises:
+        ValueError: If task loading fails
+    """
+    # Check if it's a local file
+    path = Path(source)
+    if path.exists() and path.suffix in {".json", ".jsonl"}:
+        logger.info("Loading tasks from file: %s", source)
+        items = _load_raw_from_file(path) if raw else _load_from_file(path)
+        logger.info("Loaded %d tasks from %s", len(items), source)
+        return items
+
+    # Try HUD API first
+    try:
+        logger.info("Trying HUD API: %s", source)
+        items = _load_raw_from_api(source) if raw else _load_from_api(source)
+        logger.info("Loaded %d tasks from HUD API: %s", len(items), source)
+        return items
+    except Exception as hud_error:
+        logger.debug("HUD API load failed (%s), trying HuggingFace", hud_error)
+
+    # Try HuggingFace as fallback
+    try:
+        logger.info("Trying HuggingFace dataset: %s", source)
+        items = _load_raw_from_huggingface(source) if raw else _load_from_huggingface(source)
+        logger.info("Loaded %d tasks from HuggingFace: %s", len(items), source)
+        return items
+    except ImportError:
+        raise ValueError(
+            f"Failed to load tasks from '{source}'. "
+            "Install 'datasets' package for HuggingFace support."
+        ) from None
+    except Exception as hf_error:
+        raise ValueError(f"Failed to load tasks from '{source}': {hf_error}") from hf_error
+
+
+def save_tasks(
+    name: str,
+    tasks: list[Task],
+) -> str:
+    """Save tasks to the HUD API.
+
+    Creates or updates a taskset with the given tasks.
+
+    Args:
+        name: Taskset name/slug (e.g., "my-evals/benchmark-v1").
+            If no org prefix, uses user's default org.
+        tasks: List of Task objects (v5 format) to save.
+
+    Returns:
+        The taskset ID of the created/updated taskset.
+
+    Example:
+        ```python
+        from hud.datasets import save_tasks, load_tasks
+        from hud.eval.task import Task
+        from hud.environment import Environment
+
+        # Create tasks
+        env = Environment("my-env")
+        tasks = [
+            Task(env=env, scenario="checkout", args={"user": "alice"}),
+            Task(env=env, scenario="checkout", args={"user": "bob"}),
+        ]
+
+        # Save to HUD API
+        taskset_id = save_tasks("my-evals/benchmark-v1", tasks)
+
+        # Later, load them back
+        loaded = load_tasks("my-evals/benchmark-v1")
+        ```
+
+    Raises:
+        TypeError: If any task is not a v5 Task object (must have 'scenario')
+        ValueError: If API key is not set or save fails
+    """
+    if not settings.api_key:
+        raise ValueError("HUD_API_KEY is required to save tasks")
+
+    # Validate all tasks are v5 format (must have 'scenario')
+    for i, task in enumerate(tasks):
+        if not hasattr(task, "scenario"):
+            raise TypeError(
+                f"Task at index {i} is missing 'scenario' - only v5 Task objects can be saved. "
+                "Use Task.from_v4(legacy_task) to convert from LegacyTask."
+            )
+
+    # Convert tasks to dicts (Task is a Pydantic model)
+    task_dicts = [task.model_dump(mode="json", exclude_none=True) for task in tasks]
+
+    # Build request payload
+    payload: dict[str, Any] = {
+        "name": name,
+        "tasks": task_dicts,
+    }
+
+    headers = {"Authorization": f"Bearer {settings.api_key}"}
+
+    try:
+        with httpx.Client(timeout=60) as client:
+            response = client.post(
+                f"{settings.hud_api_url}/tasks/evalset",
+                json=payload,
+                headers=headers,
+            )
+            response.raise_for_status()
+            data = response.json()
+        taskset_id = data.get("evalset_id") or data.get("id") or name
+        logger.info("Saved %d tasks to taskset: %s", len(tasks), taskset_id)
+        return taskset_id
+    except httpx.HTTPStatusError as e:
+        raise ValueError(f"Failed to save tasks: {e.response.text}") from e
+    except Exception as e:
+        raise ValueError(f"Failed to save tasks: {e}") from e
+
+
+# Deprecated alias for backwards compatibility
+def load_dataset(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, Any]]:
+    """Deprecated: Use load_tasks() instead.
+
+    .. deprecated:: 0.6.0
+        load_dataset() is deprecated. Use load_tasks() instead.
+    """
+    warnings.warn(
+        "load_dataset() is deprecated. Use load_tasks() instead.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    return load_tasks(source, raw=raw)
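
Taken together, `load_tasks` resolves a source string in order: local JSON/JSONL file, then HUD API slug, then HuggingFace dataset. A minimal usage sketch based on the docstrings above (the file path and taskset slug are illustrative, not real datasets):

```python
from hud.datasets import load_tasks, save_tasks

# Validated v5 Task objects (v4 LegacyTask files are auto-converted)
tasks = load_tasks("./my-tasks.jsonl")  # hypothetical local file

# Raw dicts keep template strings like "${HUD_API_KEY}" unsubstituted
raw_tasks = load_tasks("./my-tasks.jsonl", raw=True)

# Publish the validated tasks as a taskset (requires HUD_API_KEY)
taskset_id = save_tasks("my-evals/benchmark-v1", tasks)  # hypothetical slug
print(f"Saved taskset: {taskset_id}")
```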
hud/datasets/runner.py CHANGED
@@ -1,126 +1,219 @@
-"""Standard asyncio-based dataset runner."""
+"""Core task runner for evaluating agents on datasets.
+
+Requires the [agents] extra: pip install hud-python[agents]
+"""
 
 from __future__ import annotations
 
-import asyncio
 import logging
-from typing import TYPE_CHECKING, Any, cast
-
-from datasets import Dataset, load_dataset
+from typing import TYPE_CHECKING, Any
 
-from hud.agents.misc import ResponseAgent
-from hud.types import Task
+import hud
+from hud.types import AgentType, LegacyTask, TaskInput, Trace
 
 if TYPE_CHECKING:
-    from hud.agents import MCPAgent
+    from collections.abc import Sequence
+
+    from hud.eval.context import EvalContext
+    from hud.eval.task import Task
 
 logger = logging.getLogger("hud.datasets")
 
 
 async def run_dataset(
-    name: str,
-    dataset: str | Dataset | list[dict[str, Any]],
-    agent_class: type[MCPAgent],
-    agent_config: dict[str, Any] | None = None,
-    max_concurrent: int = 30,
-    metadata: dict[str, Any] | None = None,
+    tasks: str | TaskInput | Sequence[TaskInput],
+    agent_type: str | AgentType,
+    *,
+    agent_params: dict[str, Any] | None = None,
     max_steps: int = 10,
-    split: str = "train",
-    auto_respond: bool = False,
-    custom_system_prompt: str | None = None,
-) -> list[Any]:
-    """
-    Run all tasks in a dataset with automatic job tracking.
+    max_concurrent: int = 30,
+    group_size: int = 1,
+    quiet: bool = True,
+    taskset: str | None = None,
+) -> list[EvalContext]:
+    """Run an agent on a dataset of tasks.
+
+    This is the primary entry point for running evaluations programmatically.
+    The agent is created fresh for each task context to ensure correct tool initialization.
 
     Args:
-        name: Name for the job
-        dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
-            Dataset object, OR list of Task objects
-        agent_class: Agent class to instantiate (e.g., ClaudeAgent)
-        agent_config: Configuration/kwargs for agent (model, etc.)
-        max_concurrent: Maximum parallel task execution
-        metadata: Optional metadata for the job
-        max_steps: Maximum steps per task
-        split: Dataset split to use when loading from string (default: "train")
-        auto_respond: Whether to use auto-response agent
-        custom_system_prompt: Override system prompt for all tasks
+        tasks: Tasks to run. Can be:
+            - A source string (file path, API slug) - loaded via load_tasks()
+            - A single TaskInput (Task, LegacyTask, or dict)
+            - A list of TaskInput objects
+        agent_type: Agent type (e.g., "claude", "openai", AgentType.CLAUDE).
+        agent_params: Parameters to pass to agent.create().
+        max_steps: Maximum steps per task.
+        max_concurrent: Maximum concurrent tasks (for parallel execution).
+        group_size: Number of times to run each task (for variance estimation).
+        quiet: Whether to suppress printing eval links and opening browser (default True).
 
     Returns:
-        List of results from agent.run() in dataset order
+        List of EvalContext results from each task execution. Access `.reward` on each.
 
     Example:
-        >>> from hud.agents import ClaudeAgent
-        >>> # Option 1: From dataset string identifier
-        >>> results = await run_dataset(
-        ...     "SheetBench Eval",
-        ...     "hud-evals/SheetBench-50",
-        ...     ClaudeAgent,
-        ...     {"model": "claude-3-5-sonnet-20241022"},
-        ... )
-        >>> # Option 2: From HuggingFace dataset object
-        >>> from datasets import load_dataset
-        >>> dataset = load_dataset("hud-evals/SheetBench-50", split="train")
-        >>> results = await run_dataset("my_eval", dataset, ClaudeAgent)
-        >>> # Option 3: From list of dicts
-        >>> tasks = [{"prompt": "...", "mcp_config": {...}, ...}, ...]
-        >>> results = await run_dataset("browser_eval", tasks, ClaudeAgent)
-    """
-    # Import here to avoid circular imports
-    import hud
-
-    dataset_link = None
-
-    # Load dataset from string if needed
-    if isinstance(dataset, str):
-        logger.info("Loading dataset %s from HuggingFace...", dataset)
-        dataset_link = dataset
-
-        # Load dataset from HuggingFace
-        dataset = cast("Dataset", load_dataset(dataset, split=split))
-
-    # Create job context
-    job_metadata = metadata or {}
-    job_metadata["agent_class"] = agent_class.__name__
-    job_metadata["agent_config"] = agent_config
-
-    # Extract dataset verification info if available
-    if isinstance(dataset, Dataset) and not dataset_link:
-        try:
-            general_info = next(iter(dataset.info.__dict__["download_checksums"].keys())).split("/")
-            project = general_info[3]
-            dataset_name = general_info[4].split("@")[0]
-            dataset_link = f"{project}/{dataset_name}"
-        except Exception:
-            logger.warning("Failed to extract dataset verification info")
-
-    with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
-        # Run tasks with semaphore for concurrency control
-        sem = asyncio.Semaphore(max_concurrent)
-        results: list[Any | None] = [None] * len(dataset)
-
-        async def _worker(index: int, task_dict: Any, max_steps: int = 10) -> None:
-            async with sem:
-                # Create trace for this task
-                task_name = task_dict.get("prompt") or f"Task {index}"
-                if custom_system_prompt and "system_prompt" not in task_dict:
-                    task_dict["system_prompt"] = custom_system_prompt
-                # Ensure task_id is a string for baggage propagation
-                raw_task_id = task_dict.get("id")
-                safe_task_id = str(raw_task_id) if raw_task_id is not None else None
-                with hud.trace(task_name, job_id=job_obj.id, task_id=safe_task_id):
-                    # Convert dict to Task here, at trace level
-                    task = Task(**task_dict)
-
-                    agent = agent_class(**(agent_config or {}))
-
-                    if auto_respond:
-                        agent.response_agent = ResponseAgent()
-                    results[index] = await agent.run(task, max_steps=max_steps)
-
-        # Execute all tasks
-        await asyncio.gather(
-            *[_worker(i, task, max_steps=max_steps) for i, task in enumerate(dataset)],
-            return_exceptions=True,  # Don't fail entire batch on one error
+        ```python
+        from hud.datasets import load_tasks, run_dataset
+
+        # Load tasks and run
+        tasks = load_tasks("my-tasks.json")
+        results = await run_dataset(
+            tasks,
+            agent_type="claude",
+            agent_params={"checkpoint_name": "claude-sonnet-4-20250514"},
+            max_steps=50,
         )
 
-    return results
+        for ctx in results:
+            print(f"Reward: {ctx.reward}")
+        ```
+    """
+    from hud.datasets.loader import load_tasks
+    from hud.eval.task import Task
+
+    # Normalize agent_type to AgentType enum
+    if isinstance(agent_type, str):
+        agent_type = AgentType(agent_type)
+
+    # Normalize tasks to list[Task]
+    task_list: list[Task]
+    if isinstance(tasks, str):
+        task_list = load_tasks(tasks)
+    elif isinstance(tasks, Task):
+        task_list = [tasks]
+    elif isinstance(tasks, LegacyTask | dict):
+        # Single LegacyTask or dict - convert to Task
+        task_list = [Task.from_v4(tasks)]
+    else:
+        # Sequence of TaskInput - convert each to Task
+        task_list = [t if isinstance(t, Task) else Task.from_v4(t) for t in tasks]
+
+    if not task_list:
+        raise ValueError("No tasks to run")
+
+    # Use hud.eval() for both single and parallel execution
+    async with hud.eval(
+        task_list,
+        group=group_size,
+        max_concurrent=max_concurrent,
+        quiet=quiet,
+        taskset=taskset,
+    ) as ctx:
+        # Build agent params - use system_prompt from ctx (set from task.agent_config)
+        final_agent_params = dict(agent_params or {})
+        if ctx.system_prompt and "system_prompt" not in final_agent_params:
+            final_agent_params["system_prompt"] = ctx.system_prompt
+
+        # Create agent using AgentType.cls.create()
+        agent = agent_type.cls.create(**final_agent_params)
+        await agent.run(ctx, max_steps=max_steps)
+        # Reward is computed by EvalContext.__aexit__ from evaluate tools
+
+    # For parallel execution, results are collected via ctx.results
+    if hasattr(ctx, "results") and ctx.results:
+        return ctx.results
+
+    return [ctx]
+
+
+async def run_single_task(
+    task: Task,
+    *,
+    agent_type: AgentType,
+    agent_params: dict[str, Any] | None = None,
+    max_steps: int = 10,
+    job_id: str | None = None,
+    task_id: str | None = None,
+    group_id: str | None = None,
+    trace_name: str | None = None,
+    metadata: dict[str, Any] | None = None,
+    trace_id: str | None = None,
+    api_key: str | None = None,
+    trace: bool = True,
+    quiet: bool = False,
+) -> Trace:
+    """Run a single task with full control over eval context parameters.
+
+    This is the low-level entry point for running individual tasks with explicit
+    trace/job/group IDs. Used by remote execution workers.
+
+    Args:
+        task: Task object to run. Use Task.from_v4() or load_tasks() to create.
+        agent_type: AgentType enum specifying the agent to use.
+        agent_params: Parameters passed to agent.create(). Should include
+            pre-configured model_client for inference gateway usage.
+        max_steps: Maximum steps allowed for the agent.
+        job_id: HUD job identifier for telemetry association.
+        task_id: Task identifier (used in trace name if trace_name not provided).
+        group_id: Optional group identifier for parallel runs.
+        trace_name: Name for the trace (defaults to task_id or task.id).
+        metadata: Additional metadata for the trace context.
+        trace_id: Pre-assigned trace ID (if provided by backend).
+        api_key: API key override for telemetry and backend calls.
+        trace: Whether to send trace data to backend (default True).
+        quiet: Whether to suppress printing eval link (default False).
+
+    Returns:
+        Trace result from the agent run.
+
+    Example:
+        ```python
+        from hud.datasets import run_single_task
+        from hud.eval.task import Task
+        from hud.types import AgentType
+        from openai import AsyncOpenAI
+
+        # Create task (from v4 dict or directly)
+        task = Task.from_v4({"prompt": "...", "mcp_config": {...}, "evaluate_tool": {...}})
+
+        # Configure agent with inference gateway
+        agent_params = {
+            "checkpoint_name": "gpt-4o",
+            "validate_api_key": False,
+            "model_client": AsyncOpenAI(
+                api_key=hud_api_key,
+                base_url=settings.hud_gateway_url,
+            ),
+        }
+
+        result = await run_single_task(
+            task=task,
+            agent_type=AgentType.OPENAI,
+            agent_params=agent_params,
+            max_steps=20,
+            job_id="job-123",
+            task_id="task-456",
+        )
+        ```
+    """
+    # Determine trace name
+    effective_trace_name = trace_name or task_id or task.id or "single_task"
+
+    # Run with explicit eval context parameters
+    async with hud.eval(
+        task,
+        name=effective_trace_name,
+        job_id=job_id,
+        group_id=group_id,
+        trace_id=trace_id,
+        api_key=api_key,
+        trace=trace,
+        quiet=quiet,
+    ) as ctx:
+        # Build agent params - use system_prompt from ctx (set from task.agent_config)
+        final_agent_params = dict(agent_params or {})
+        if ctx.system_prompt and "system_prompt" not in final_agent_params:
+            final_agent_params["system_prompt"] = ctx.system_prompt
+
+        # Create agent using AgentType.cls.create()
+        agent = agent_type.cls.create(**final_agent_params)
+
+        # Store metadata if provided
+        if metadata:
+            ctx.metadata.update(metadata)
+
+        result = await agent.run(ctx, max_steps=max_steps)
+        # Reward is computed by EvalContext.__aexit__ from evaluate tools
+
+    # Return the Trace (ctx.reward is set by EvalContext.__aexit__)
+    return result
+ return result
File without changes