hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/datasets/loader.py ADDED
@@ -0,0 +1,327 @@
+ """Task loading utilities for HUD.
+
+ Unified interface for loading evaluation tasks from:
+ - HUD API (v5 format)
+ - Local JSON/JSONL files (v4 LegacyTask format, auto-converted)
+ - HuggingFace datasets (v4 LegacyTask format, auto-converted)
+ """
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ import warnings
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, overload
+
+ if TYPE_CHECKING:
+     from hud.eval.task import Task
+
+ logger = logging.getLogger(__name__)
+
+ __all__ = ["load_dataset", "load_tasks", "save_tasks"]
+
+
+ def _load_raw_from_file(path: Path) -> list[dict[str, Any]]:
+     """Load raw task dicts from a local JSON or JSONL file."""
+     raw_items: list[dict[str, Any]] = []
+
+     if path.suffix == ".jsonl":
+         # JSONL: one task per line
+         with open(path, encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if not line:
+                     continue
+                 item = json.loads(line)
+                 # Handle case where line contains a list
+                 if isinstance(item, list):
+                     raw_items.extend(i for i in item if isinstance(i, dict))
+                 elif isinstance(item, dict):
+                     raw_items.append(item)
+                 else:
+                     raise ValueError(
+                         f"Invalid JSONL format: expected dict or list, got {type(item)}"
+                     )
+     else:
+         # JSON: array of tasks
+         with open(path, encoding="utf-8") as f:
+             data = json.load(f)
+
+         if isinstance(data, list):
+             raw_items = [item for item in data if isinstance(item, dict)]
+         elif isinstance(data, dict):
+             raw_items = [data]
+         else:
+             raise ValueError(f"JSON file must contain an array or object, got {type(data)}")
+
+     return raw_items
+
+
+ def _load_from_file(path: Path) -> list[Task]:
+     """Load tasks from a local JSON or JSONL file."""
+     from hud.eval.task import Task
+
+     raw_items = _load_raw_from_file(path)
+     return [Task(**item) for item in raw_items]
+
+
+ def _load_raw_from_huggingface(dataset_name: str) -> list[dict[str, Any]]:
+     """Load raw task dicts from HuggingFace dataset."""
+     try:
+         from datasets import load_dataset as hf_load_dataset
+     except ImportError as e:
+         raise ImportError(
+             "Please install 'datasets' to load from HuggingFace: uv pip install datasets"
+         ) from e
+
+     # Parse dataset name and optional split
+     if ":" in dataset_name:
+         name, split = dataset_name.split(":", 1)
+     else:
+         name = dataset_name
+         split = "train"  # Default split
+
+     logger.info("Loading from HuggingFace dataset: %s (split=%s)", name, split)
+     dataset = hf_load_dataset(name, split=split)
+
+     raw_items: list[dict[str, Any]] = []
+     for item in dataset:
+         if not isinstance(item, dict):
+             raise ValueError(f"Invalid HuggingFace dataset: expected dict, got {type(item)}")
+         raw_items.append(dict(item))
+
+     return raw_items
+
+
+ def _load_from_huggingface(dataset_name: str) -> list[Task]:
+     """Load tasks from HuggingFace dataset."""
+     raw_items = _load_raw_from_huggingface(dataset_name)
+     from hud.eval.task import Task
+
+     return [Task(**item) for item in raw_items]
+
+
+ def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
+     """Load raw task dicts from HUD API."""
+     import httpx
+
+     from hud.settings import settings
+
+     headers = {}
+     if settings.api_key:
+         headers["Authorization"] = f"Bearer {settings.api_key}"
+
+     with httpx.Client() as client:
+         response = client.get(
+             f"{settings.hud_api_url}/tasks/evalset/{dataset_name}",
+             headers=headers,
+             params={"all": "true"},
+         )
+         response.raise_for_status()
+         data = response.json()
+
+     # Extract tasks dict from response
+     tasks_dict = data.get("tasks", {})
+
+     raw_items: list[dict[str, Any]] = []
+     for task_id, task_data in tasks_dict.items():
+         if task_data.get("id") is None:
+             task_data["id"] = task_id
+         raw_items.append(task_data)
+
+     return raw_items
+
+
+ def _load_from_api(dataset_name: str) -> list[Task]:
+     """Load tasks from HUD API."""
+     from hud.eval.task import Task
+
+     raw_items = _load_raw_from_api(dataset_name)
+     return [Task(**item) for item in raw_items]
+
+
+ @overload
+ def load_tasks(source: str, *, raw: bool = False) -> list[Task]: ...
+
+
+ @overload
+ def load_tasks(source: str, *, raw: bool = True) -> list[dict[str, Any]]: ...
+
+
+ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, Any]]:
+     """Load tasks from a source.
+
+     Supports multiple sources with auto-detection:
+     - Local file path (JSON or JSONL)
+     - HUD API dataset slug (e.g., "hud-evals/SheetBench-50")
+     - HuggingFace dataset (e.g., "username/dataset" or "username/dataset:split")
+
+     Automatically detects and converts v4 LegacyTask format to v5 Task.
+
+     Args:
+         source: Task source. Can be:
+             - Path to a local JSON/JSONL file
+             - HUD API dataset slug (e.g., "hud-evals/SheetBench-50")
+             - HuggingFace dataset name (e.g., "hud-evals/tasks" or "hud-evals/tasks:train")
+         raw: If True, return raw dicts without validation or env var substitution.
+             Useful for preserving template strings like "${HUD_API_KEY}".
+
+     Returns:
+         - If raw=False (default): list[Task] ready to use with hud.eval()
+         - If raw=True: list[dict] with raw task data
+
+     Example:
+         ```python
+         import hud
+         from hud.datasets import load_tasks
+
+         # Load from HUD API
+         tasks = load_tasks("hud-evals/SheetBench-50")
+
+         # Load from local file (v4 format auto-converted)
+         tasks = load_tasks("./my-tasks.json")
+
+         # Load from HuggingFace
+         tasks = load_tasks("hud-evals/benchmark:test")
+
+         # Load raw dicts (preserves env var placeholders)
+         raw_tasks = load_tasks("./tasks.json", raw=True)
+
+         # Run evaluation
+         async with hud.eval(tasks) as ctx:
+             await agent.run(ctx)
+         ```
+
+     Raises:
+         ValueError: If task loading fails
+     """
+     # Check if it's a local file
+     path = Path(source)
+     if path.exists() and path.suffix in {".json", ".jsonl"}:
+         logger.info("Loading tasks from file: %s", source)
+         items = _load_raw_from_file(path) if raw else _load_from_file(path)
+         logger.info("Loaded %d tasks from %s", len(items), source)
+         return items
+
+     # Try HUD API first
+     try:
+         logger.info("Trying HUD API: %s", source)
+         items = _load_raw_from_api(source) if raw else _load_from_api(source)
+         logger.info("Loaded %d tasks from HUD API: %s", len(items), source)
+         return items
+     except Exception as hud_error:
+         logger.debug("HUD API load failed (%s), trying HuggingFace", hud_error)
+
+     # Try HuggingFace as fallback
+     try:
+         logger.info("Trying HuggingFace dataset: %s", source)
+         items = _load_raw_from_huggingface(source) if raw else _load_from_huggingface(source)
+         logger.info("Loaded %d tasks from HuggingFace: %s", len(items), source)
+         return items
+     except ImportError:
+         raise ValueError(
+             f"Failed to load tasks from '{source}'. "
+             "Install 'datasets' package for HuggingFace support."
+         ) from None
+     except Exception as hf_error:
+         raise ValueError(f"Failed to load tasks from '{source}': {hf_error}") from hf_error
+
+
+ def save_tasks(
+     name: str,
+     tasks: list[Task],
+ ) -> str:
+     """Save tasks to the HUD API.
+
+     Creates or updates an evalset with the given tasks.
+
+     Args:
+         name: Evalset name/slug (e.g., "my-evals/benchmark-v1").
+             If no org prefix, uses user's default org.
+         tasks: List of Task objects (v5 format) to save.
+
+     Returns:
+         The evalset ID of the created/updated evalset.
+
+     Example:
+         ```python
+         from hud.datasets import save_tasks, load_tasks
+         from hud.eval.task import Task
+         from hud.environment import Environment
+
+         # Create tasks
+         env = Environment("my-env")
+         tasks = [
+             Task(env=env, scenario="checkout", args={"user": "alice"}),
+             Task(env=env, scenario="checkout", args={"user": "bob"}),
+         ]
+
+         # Save to HUD API
+         evalset_id = save_tasks("my-evals/benchmark-v1", tasks)
+
+         # Later, load them back
+         loaded = load_tasks("my-evals/benchmark-v1")
+         ```
+
+     Raises:
+         TypeError: If any task is not a v5 Task object (must have 'scenario')
+         ValueError: If API key is not set or save fails
+     """
+     import httpx
+
+     from hud.settings import settings
+
+     if not settings.api_key:
+         raise ValueError("HUD_API_KEY is required to save tasks")
+
+     # Validate all tasks are v5 format (must have 'scenario')
+     for i, task in enumerate(tasks):
+         if not hasattr(task, "scenario"):
+             raise TypeError(
+                 f"Task at index {i} is missing 'scenario' - only v5 Task objects can be saved. "
+                 "Use Task.from_v4(legacy_task) to convert from LegacyTask."
+             )
+
+     # Convert tasks to dicts (Task is a Pydantic model)
+     task_dicts = [task.model_dump(mode="json", exclude_none=True) for task in tasks]
+
+     # Build request payload
+     payload: dict[str, Any] = {
+         "name": name,
+         "tasks": task_dicts,
+     }
+
+     headers = {"Authorization": f"Bearer {settings.api_key}"}
+
+     try:
+         with httpx.Client(timeout=60) as client:
+             response = client.post(
+                 f"{settings.hud_api_url}/tasks/evalset",
+                 json=payload,
+                 headers=headers,
+             )
+             response.raise_for_status()
+             data = response.json()
+             evalset_id = data.get("evalset_id") or data.get("id") or name
+             logger.info("Saved %d tasks to evalset: %s", len(tasks), evalset_id)
+             return evalset_id
+     except httpx.HTTPStatusError as e:
+         raise ValueError(f"Failed to save tasks: {e.response.text}") from e
+     except Exception as e:
+         raise ValueError(f"Failed to save tasks: {e}") from e
+
+
+ # Deprecated alias for backwards compatibility
+ def load_dataset(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, Any]]:
+     """Deprecated: Use load_tasks() instead.
+
+     .. deprecated:: 0.6.0
+         load_dataset() is deprecated. Use load_tasks() instead.
+     """
+     warnings.warn(
+         "load_dataset() is deprecated. Use load_tasks() instead.",
+         DeprecationWarning,
+         stacklevel=2,
+     )
+     return load_tasks(source, raw=raw)
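
A minimal sketch of how the two loading modes above fit together (editorial illustration, not part of the diff; assumes a local ./tasks.json exists and the package is installed). By default load_tasks returns validated Task objects, while raw=True keeps the untouched dicts, which can still be validated later with Task(**item), mirroring what the loader itself does:

```python
from hud.datasets import load_tasks
from hud.eval.task import Task

# Default: validated Task objects (v4 files are auto-converted)
tasks = load_tasks("./tasks.json")

# raw=True: plain dicts; placeholders like "${HUD_API_KEY}" are preserved
raw_items = load_tasks("./tasks.json", raw=True)

# Raw dicts can be validated later, as the loader does internally
tasks_again = [Task(**item) for item in raw_items]
```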
hud/datasets/runner.py CHANGED
@@ -1,126 +1,213 @@
- """Standard asyncio-based dataset runner."""
+ """Core task runner for evaluating agents on datasets.
+
+ Requires the [agents] extra: pip install hud-python[agents]
+ """

   from __future__ import annotations

- import asyncio
   import logging
- from typing import TYPE_CHECKING, Any, cast
-
- from datasets import Dataset, load_dataset
+ from typing import TYPE_CHECKING, Any

- from hud.agents.misc import ResponseAgent
- from hud.types import Task
+ import hud
+ from hud.types import AgentType, LegacyTask, TaskInput, Trace

   if TYPE_CHECKING:
-     from hud.agents import MCPAgent
+     from collections.abc import Sequence
+
+     from hud.eval.context import EvalContext
+     from hud.eval.task import Task

   logger = logging.getLogger("hud.datasets")


   async def run_dataset(
-     name: str,
-     dataset: str | Dataset | list[dict[str, Any]],
-     agent_class: type[MCPAgent],
-     agent_config: dict[str, Any] | None = None,
-     max_concurrent: int = 30,
-     metadata: dict[str, Any] | None = None,
+     tasks: str | TaskInput | Sequence[TaskInput],
+     agent_type: str | AgentType,
+     *,
+     agent_params: dict[str, Any] | None = None,
       max_steps: int = 10,
-     split: str = "train",
-     auto_respond: bool = False,
-     custom_system_prompt: str | None = None,
- ) -> list[Any]:
-     """
-     Run all tasks in a dataset with automatic job tracking.
+     max_concurrent: int = 30,
+     group_size: int = 1,
+     quiet: bool = True,
+ ) -> list[EvalContext]:
+     """Run an agent on a dataset of tasks.
+
+     This is the primary entry point for running evaluations programmatically.
+     The agent is created fresh for each task context to ensure correct tool initialization.

       Args:
-         name: Name for the job
-         dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
-             Dataset object, OR list of Task objects
-         agent_class: Agent class to instantiate (e.g., ClaudeAgent)
-         agent_config: Configuration/kwargs for agent (model, etc.)
-         max_concurrent: Maximum parallel task execution
-         metadata: Optional metadata for the job
-         max_steps: Maximum steps per task
-         split: Dataset split to use when loading from string (default: "train")
-         auto_respond: Whether to use auto-response agent
-         custom_system_prompt: Override system prompt for all tasks
+         tasks: Tasks to run. Can be:
+             - A source string (file path, API slug) - loaded via load_tasks()
+             - A single TaskInput (Task, LegacyTask, or dict)
+             - A list of TaskInput objects
+         agent_type: Type of agent to create (e.g., "claude", "openai", AgentType.CLAUDE).
+         agent_params: Parameters to pass to agent.create().
+         max_steps: Maximum steps per task.
+         max_concurrent: Maximum concurrent tasks (for parallel execution).
+         group_size: Number of times to run each task (for variance estimation).
+         quiet: Whether to suppress printing eval links and opening browser (default True).

       Returns:
-         List of results from agent.run() in dataset order
+         List of EvalContext results from each task execution. Access `.reward` on each.

       Example:
-         >>> from hud.agents import ClaudeAgent
-         >>> # Option 1: From dataset string identifier
-         >>> results = await run_dataset(
-         ...     "SheetBench Eval",
-         ...     "hud-evals/SheetBench-50",
-         ...     ClaudeAgent,
-         ...     {"model": "claude-3-5-sonnet-20241022"},
-         ... )
-         >>> # Option 2: From HuggingFace dataset object
-         >>> from datasets import load_dataset
-         >>> dataset = load_dataset("hud-evals/SheetBench-50", split="train")
-         >>> results = await run_dataset("my_eval", dataset, ClaudeAgent)
-         >>> # Option 3: From list of dicts
-         >>> tasks = [{"prompt": "...", "mcp_config": {...}, ...}, ...]
-         >>> results = await run_dataset("browser_eval", tasks, ClaudeAgent)
-     """
-     # Import here to avoid circular imports
-     import hud
-
-     dataset_link = None
-
-     # Load dataset from string if needed
-     if isinstance(dataset, str):
-         logger.info("Loading dataset %s from HuggingFace...", dataset)
-         dataset_link = dataset
-
-         # Load dataset from HuggingFace
-         dataset = cast("Dataset", load_dataset(dataset, split=split))
-
-     # Create job context
-     job_metadata = metadata or {}
-     job_metadata["agent_class"] = agent_class.__name__
-     job_metadata["agent_config"] = agent_config
-
-     # Extract dataset verification info if available
-     if isinstance(dataset, Dataset) and not dataset_link:
-         try:
-             general_info = next(iter(dataset.info.__dict__["download_checksums"].keys())).split("/")
-             project = general_info[3]
-             dataset_name = general_info[4].split("@")[0]
-             dataset_link = f"{project}/{dataset_name}"
-         except Exception:
-             logger.warning("Failed to extract dataset verification info")
-
-     with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
-         # Run tasks with semaphore for concurrency control
-         sem = asyncio.Semaphore(max_concurrent)
-         results: list[Any | None] = [None] * len(dataset)
-
-         async def _worker(index: int, task_dict: Any, max_steps: int = 10) -> None:
-             async with sem:
-                 # Create trace for this task
-                 task_name = task_dict.get("prompt") or f"Task {index}"
-                 if custom_system_prompt and "system_prompt" not in task_dict:
-                     task_dict["system_prompt"] = custom_system_prompt
-                 # Ensure task_id is a string for baggage propagation
-                 raw_task_id = task_dict.get("id")
-                 safe_task_id = str(raw_task_id) if raw_task_id is not None else None
-                 with hud.trace(task_name, job_id=job_obj.id, task_id=safe_task_id):
-                     # Convert dict to Task here, at trace level
-                     task = Task(**task_dict)
-
-                     agent = agent_class(**(agent_config or {}))
-
-                     if auto_respond:
-                         agent.response_agent = ResponseAgent()
-                     results[index] = await agent.run(task, max_steps=max_steps)
-
-         # Execute all tasks
-         await asyncio.gather(
-             *[_worker(i, task, max_steps=max_steps) for i, task in enumerate(dataset)],
-             return_exceptions=True,  # Don't fail entire batch on one error
+         ```python
+         from hud.datasets import load_tasks, run_dataset
+
+         # Load tasks and run
+         tasks = load_tasks("my-tasks.json")
+         results = await run_dataset(
+             tasks,
+             agent_type="claude",
+             agent_params={"checkpoint_name": "claude-sonnet-4-20250514"},
+             max_steps=50,
           )

-     return results
+         for ctx in results:
+             print(f"Reward: {ctx.reward}")
+         ```
+     """
+     from hud.datasets.loader import load_tasks
+     from hud.eval.task import Task
+
+     # Normalize tasks to list[Task]
+     task_list: list[Task]
+     if isinstance(tasks, str):
+         task_list = load_tasks(tasks)
+     elif isinstance(tasks, Task):
+         task_list = [tasks]
+     elif isinstance(tasks, LegacyTask | dict):
+         # Single LegacyTask or dict - convert to Task
+         task_list = [Task.from_v4(tasks)]
+     else:
+         # Sequence of TaskInput - convert each to Task
+         task_list = [t if isinstance(t, Task) else Task.from_v4(t) for t in tasks]
+
+     if not task_list:
+         raise ValueError("No tasks to run")
+
+     # Resolve agent class
+     agent_type_enum = agent_type if isinstance(agent_type, AgentType) else AgentType(agent_type)
+     agent_cls = agent_type_enum.cls
+
+     # Use hud.eval() for both single and parallel execution
+     async with hud.eval(
+         task_list,
+         group=group_size,
+         max_concurrent=max_concurrent,
+         quiet=quiet,
+     ) as ctx:
+         # Create agent fresh for each context (ensures correct tool initialization)
+         agent = agent_cls.create(**(agent_params or {}))
+         await agent.run(ctx, max_steps=max_steps)
+         # Reward is computed by EvalContext.__aexit__ from evaluate tools
+
+     # For parallel execution, results are collected via ctx.results
+     if hasattr(ctx, "results") and ctx.results:
+         return ctx.results
+
+     return [ctx]
+
+
+ async def run_single_task(
+     task: Task,
+     *,
+     agent_type: AgentType,
+     agent_params: dict[str, Any] | None = None,
+     max_steps: int = 10,
+     job_id: str | None = None,
+     task_id: str | None = None,
+     group_id: str | None = None,
+     trace_name: str | None = None,
+     metadata: dict[str, Any] | None = None,
+     trace_id: str | None = None,
+     api_key: str | None = None,
+     trace: bool = True,
+     quiet: bool = False,
+ ) -> Trace:
+     """Run a single task with full control over eval context parameters.
+
+     This is the low-level entry point for running individual tasks with explicit
+     trace/job/group IDs. Used by remote execution workers.
+
+     Args:
+         task: Task object to run. Use Task.from_v4() or load_tasks() to create.
+         agent_type: AgentType enum specifying the agent to use.
+         agent_params: Parameters passed to agent.create(). Should include
+             pre-configured model_client for inference gateway usage.
+         max_steps: Maximum steps allowed for the agent.
+         job_id: HUD job identifier for telemetry association.
+         task_id: Task identifier (used in trace name if trace_name not provided).
+         group_id: Optional group identifier for parallel runs.
+         trace_name: Name for the trace (defaults to task_id or task.id).
+         metadata: Additional metadata for the trace context.
+         trace_id: Pre-assigned trace ID (if provided by backend).
+         api_key: API key override for telemetry and backend calls.
+         trace: Whether to send trace data to backend (default True).
+         quiet: Whether to suppress printing eval link (default False).
+
+     Returns:
+         Trace result from the agent run.
+
+     Example:
+         ```python
+         from hud.datasets import run_single_task
+         from hud.eval.task import Task
+         from hud.types import AgentType
+         from openai import AsyncOpenAI
+
+         # Create task (from v4 dict or directly)
+         task = Task.from_v4({"prompt": "...", "mcp_config": {...}, "evaluate_tool": {...}})
+
+         # Configure agent with inference gateway
+         agent_params = {
+             "checkpoint_name": "gpt-4o",
+             "validate_api_key": False,
+             "model_client": AsyncOpenAI(
+                 api_key=hud_api_key,
+                 base_url=settings.hud_gateway_url,
+             ),
+         }
+
+         result = await run_single_task(
+             task=task,
+             agent_type=AgentType.OPENAI,
+             agent_params=agent_params,
+             max_steps=20,
+             job_id="job-123",
+             task_id="task-456",
+         )
+         ```
+     """
+     # Determine trace name
+     effective_trace_name = trace_name or task_id or task.id or "single_task"
+
+     # Run with explicit eval context parameters
+     async with hud.eval(
+         task,
+         name=effective_trace_name,
+         job_id=job_id,
+         group_id=group_id,
+         trace_id=trace_id,
+         api_key=api_key,
+         trace=trace,
+         quiet=quiet,
+     ) as ctx:
+         # Build agent params - use system_prompt from ctx (set from task.agent_config)
+         final_agent_params = dict(agent_params or {})
+         if ctx.system_prompt and "system_prompt" not in final_agent_params:
+             final_agent_params["system_prompt"] = ctx.system_prompt
+
+         # Create agent inside ctx so it has access to context-derived values
+         agent_cls = agent_type.cls
+         agent = agent_cls.create(**final_agent_params)
+
+         # Store metadata if provided
+         if metadata:
+             ctx.metadata.update(metadata)
+
+         result = await agent.run(ctx, max_steps=max_steps)
+         # Reward is computed by EvalContext.__aexit__ from evaluate tools
+
+     # Return the Trace (ctx.reward is set by EvalContext.__aexit__)
+     return result
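
For reference, a hedged before/after sketch of the run_dataset signature change shown in this diff (editorial illustration only; the import paths, agent names, and model strings are taken from the removed and added docstrings, and the new API also accepts the source string directly in place of a pre-loaded task list):

```python
# 0.4.x (removed): job name, agent class, and agent config passed positionally
from hud.agents import ClaudeAgent

results = await run_dataset(
    "SheetBench Eval",
    "hud-evals/SheetBench-50",
    ClaudeAgent,
    {"model": "claude-3-5-sonnet-20241022"},
)

# 0.5.x: tasks plus an agent_type string; results are EvalContext objects
from hud.datasets import load_tasks, run_dataset

tasks = load_tasks("hud-evals/SheetBench-50")
results = await run_dataset(
    tasks,
    agent_type="claude",
    agent_params={"checkpoint_name": "claude-sonnet-4-20250514"},
)
rewards = [ctx.reward for ctx in results]
```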