hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/datasets/parallel.py DELETED
@@ -1,687 +0,0 @@
- """Process-based parallel dataset runner."""
-
- from __future__ import annotations
-
- import asyncio
- import logging
- import multiprocessing
- import os
- import traceback
- from concurrent.futures import ProcessPoolExecutor, as_completed
- from functools import partial
- from typing import TYPE_CHECKING, Any
-
- if TYPE_CHECKING:
-     from datasets import Dataset
-
-     from hud.agents import MCPAgent
-
- logger = logging.getLogger(__name__)
-
-
- # Worker function that runs in a separate process
- def _process_worker(
-     task_batch: list[tuple[int, dict[str, Any]]],
-     agent_class_module: str,
-     agent_class_name: str,
-     agent_config: dict[str, Any] | None,
-     job_id: str,
-     job_name: str,
-     max_steps: int,
-     auto_respond: bool,
-     worker_id: int,
-     total_workers: int,
-     max_concurrent_per_worker: int,
- ) -> list[tuple[int, Any]]:
-     """
-     Worker function that runs in a separate process.
-
-     This function:
-     1. Reinitializes telemetry in the new process
-     2. Creates its own event loop
-     3. Processes a batch of tasks asynchronously
-     4. Returns results with their original indices
-     5. Handles interruption signals gracefully
-
-     Args:
-         task_batch: List of (index, task_dict) tuples
-         agent_class_module: Module path for the agent class
-         agent_class_name: Name of the agent class
-         agent_config: Configuration for agent initialization
-         job_id: Job ID for telemetry tracking
-         job_name: Job name for logging
-         max_steps: Maximum steps per task
-         auto_respond: Whether to use ResponseAgent
-         worker_id: ID of this worker process
-         total_workers: Total number of worker processes
-         max_concurrent_per_worker: Maximum concurrent tasks within each worker
-
-     Returns:
-         List of (index, result) tuples
-     """
-     # Import inside worker to avoid pickling issues
-     import signal
-     import sys
-
-     import hud
-     from hud.agents.misc.response_agent import ResponseAgent
-     from hud.otel import configure_telemetry
-     from hud.types import Task
-
-     # Ensure stdout is not buffered for immediate output
-     try:
-         sys.stdout.reconfigure(line_buffering=True) # type: ignore
-         sys.stderr.reconfigure(line_buffering=True) # type: ignore
-     except AttributeError:
-         pass
-
-     # Set up signal handler for clean interruption
-     def signal_handler(signum: int, frame: Any) -> None:
-         logger.warning("Worker %s: Received interrupt signal", worker_id)
-         # Raise KeyboardInterrupt to actually interrupt the worker
-         raise KeyboardInterrupt(f"Worker {worker_id} interrupted by user")
-
-     signal.signal(signal.SIGINT, signal_handler)
-
-     # Reinitialize telemetry in this process
-     configure_telemetry()
-
-     # Dynamically import the agent class
-     try:
-         import importlib
-
-         module = importlib.import_module(agent_class_module)
-         agent_class = getattr(module, agent_class_name)
-     except (ImportError, AttributeError) as e:
-         logger.error("Worker %s: Failed to import agent class: %s", worker_id, e)
-         return [(idx, {"error": str(e), "isError": True}) for idx, _ in task_batch]
-
-     # Create new event loop for this process
-     loop = asyncio.new_event_loop()
-     asyncio.set_event_loop(loop)
-
-     async def process_batch() -> list[tuple[int, Any]]:
-         """Process all tasks in the batch asynchronously."""
-         results = []
-
-         # Use semaphore to limit concurrency within the process
-         sem = asyncio.Semaphore(max_concurrent_per_worker)
-
-         async def process_single_task(index: int, task_dict: dict[str, Any]) -> tuple[int, Any]:
-             """Process a single task with telemetry tracking."""
-             async with sem:
-                 try:
-                     # Create trace for this task (linked to the job) - match original format
-                     task_name = task_dict.get("prompt") or f"Task {index}"
-
-                     # Use the job_id to group all tasks under the same job
-                     raw_task_id = task_dict.get("id")
-                     safe_task_id = str(raw_task_id) if raw_task_id is not None else None
-                     with hud.trace(task_name, job_id=job_id, task_id=safe_task_id):
-                         # Convert dict to Task
-                         task = Task(**task_dict)
-
-                         # Create agent instance
-                         agent = agent_class(**(agent_config or {}))
-
-                         if auto_respond:
-                             agent.response_agent = ResponseAgent()
-
-                         # Run the task
-                         result = await agent.run(task, max_steps=max_steps)
-
-                         # Extract and print evaluation score for visibility
-                         reward = getattr(result, "reward", "N/A")
-                         logger.info(
-                             "[Worker %s] Task %s: ✓ Completed (reward: %s)",
-                             worker_id,
-                             index,
-                             reward,
-                         )
-
-                         logger.info(
-                             "[Worker %s] Completed task %s (reward: %s)",
-                             worker_id,
-                             index,
-                             reward,
-                         )
-
-                         return (index, result)
-
-                 except Exception as e:
-                     error_msg = f"Worker {worker_id}: Task {index} failed: {e}"
-                     logger.error(
-                         "[Worker %s] Task %s: ✗ Failed (%s)", worker_id, index, str(e)[:100]
-                     )
-                     logger.error("%s\n%s", error_msg, traceback.format_exc())
-
-                     return (
-                         index,
-                         {
-                             "error": str(e),
-                             "traceback": traceback.format_exc(),
-                             "isError": True,
-                             "reward": 0.0,
-                             "done": False,
-                             "content": f"Task failed: {e}",
-                         },
-                     )
-
-         # Process all tasks in parallel within this process
-         tasks = [process_single_task(idx, task_dict) for idx, task_dict in task_batch]
-
-         try:
-             results = await asyncio.gather(*tasks, return_exceptions=False)
-             return results
-         except asyncio.CancelledError:
-             logger.info("Worker %s: Tasks cancelled due to interruption", worker_id)
-             # Return error results for all tasks
-             return [
-                 (
-                     idx,
-                     {
-                         "error": "Task cancelled (Ctrl+C)",
-                         "isError": True,
-                         "reward": 0.0,
-                         "done": False,
-                         "content": "Task cancelled",
-                     },
-                 )
-                 for idx, _ in task_batch
-             ]
-
-     try:
-         # Run the async batch processing
-         results = loop.run_until_complete(process_batch())
-
-         # Ensure telemetry is fully sent before process exits
-         # Spans are buffered in BatchSpanProcessor and need explicit flush
-
-         from opentelemetry import trace as otel_trace
-
-         provider = otel_trace.get_tracer_provider()
-         if provider and hasattr(provider, "force_flush"):
-             # Flush of buffered spans
-             success = provider.force_flush(timeout_millis=2000) # type: ignore[arg-type]
-             if not success:
-                 logger.warning("Worker %s: Telemetry flush timed out", worker_id)
-
-         return results
-     except KeyboardInterrupt:
-         logger.info("Worker %s: Interrupted by user, stopping gracefully", worker_id)
-         # Return partial results for tasks that completed
-         partial_results = []
-         for idx, _ in task_batch:
-             partial_results.append(
-                 (
-                     idx,
-                     {
-                         "error": "Worker interrupted by user (Ctrl+C)",
-                         "isError": True,
-                         "reward": 0.0,
-                         "done": False,
-                         "content": "Task interrupted",
-                     },
-                 )
-             )
-         return partial_results
-     except Exception as e:
-         logger.error("[Worker %s] Batch processing failed: %s", worker_id, e)
-         logger.error("Worker %s batch processing failed: %s", worker_id, e)
-         return [(idx, {"error": str(e), "isError": True}) for idx, _ in task_batch]
-     finally:
-         try:
-             from opentelemetry import trace as otel_trace
-
-             provider = otel_trace.get_tracer_provider()
-             if provider and hasattr(provider, "force_flush"):
-                 # Flush buffered spans with reasonable timeout
-                 success = provider.force_flush(timeout_millis=2000) # type: ignore[arg-type]
-                 if not success:
-                     logger.warning("Worker %s: Telemetry flush timed out", worker_id)
-         except Exception as e:
-             logger.warning("Worker %s: Failed to flush telemetry: %s", worker_id, e)
-
-         # Clean up the event loop
-         try:
-             loop.close()
-         except Exception as e:
-             logger.warning("Worker %s: Failed to close event loop: %s", worker_id, e)
-
-
- async def run_dataset_parallel_manual(
-     name: str,
-     dataset: str | Dataset | list[dict[str, Any]],
-     agent_class: type[MCPAgent],
-     agent_config: dict[str, Any] | None = None,
-     max_workers: int | None = None,
-     max_concurrent_per_worker: int = 25,
-     max_concurrent: int | None = None,
-     metadata: dict[str, Any] | None = None,
-     max_steps: int = 10,
-     split: str = "train",
-     auto_respond: bool = False,
-     custom_system_prompt: str | None = None,
- ) -> list[Any]:
-     """
-     Run all tasks in a dataset using process-based parallelism with manual configuration.
-
-     This function distributes tasks evenly across multiple processes to achieve true parallelism,
-     bypassing Python's GIL limitations. Each process runs its own event loop with concurrent
-     task execution controlled by max_concurrent_per_worker or max_concurrent.
-
-     Args:
-         name: Name for the job (shown in telemetry)
-         dataset: HuggingFace dataset identifier, Dataset object, or list of task dicts
-         agent_class: Agent class to use (must be importable in worker processes)
-         agent_config: Configuration for agent initialization
-         max_workers: Number of processes (defaults to CPU count)
-         max_concurrent_per_worker: Max concurrent tasks within each worker
-         max_concurrent: Optional total concurrent limit across all workers (overrides per-worker)
-         metadata: Optional metadata for the job
-         max_steps: Maximum steps per task
-         split: Dataset split when loading from string
-         auto_respond: Whether to use ResponseAgent
-         custom_system_prompt: Override system prompt for all tasks
-
-     Returns:
-         List of results in the same order as the input dataset
-
-     Example:
-         >>> from hud.agents import ClaudeAgent
-         >>> from hud.datasets import run_dataset_parallel_manual
-         >>> # Run with 8 workers, 10 concurrent per worker (80 total concurrent)
-         >>> results = await run_dataset_parallel_manual(
-         ...     "Large Scale Eval",
-         ...     "hud-evals/benchmark-400",
-         ...     ClaudeAgent,
-         ...     max_workers=8,
-         ...     max_concurrent_per_worker=10,
-         ... )
-         >>> # OR limit total concurrent to prevent rate limits
-         >>> results = await run_dataset_parallel_manual(
-         ...     "Rate Limited Eval",
-         ...     dataset,
-         ...     ClaudeAgent,
-         ...     max_workers=8,
-         ...     max_concurrent=20, # Only 20 total concurrent
-         ... )
-     """
-     from datasets import Dataset
-     from datasets import load_dataset as hf_load_dataset
-
-     import hud
-
-     # Determine optimal worker count
-     if max_workers is None:
-         max_workers = min(os.cpu_count() or 4, 16) # Cap at 16 to be reasonable
-
-     # If max_concurrent is specified, calculate per-worker concurrency
-     if max_concurrent is not None:
-         # Distribute concurrent limit across workers
-         # Each worker should get a fair share of the total concurrent limit
-         max_concurrent_per_worker = max(1, max_concurrent // max_workers)
-         logger.info(
-             "Limiting to %s total concurrent tasks %s per worker)",
-             max_concurrent,
-             max_concurrent_per_worker,
-         )
-
-     logger.info(
-         "Starting parallel dataset run with %s workers (%s concurrent per worker)",
-         max_workers,
-         max_concurrent_per_worker,
-     )
-
-     # Load dataset if needed
-     dataset_link = None
-     task_dicts: list[dict[str, Any]]
-
-     if isinstance(dataset, str):
-         logger.info("Loading dataset %s from HuggingFace...", dataset)
-         dataset_link = dataset
-         loaded_dataset = hf_load_dataset(dataset, split=split)
-         task_dicts = list(loaded_dataset) # type: ignore
-     elif isinstance(dataset, Dataset):
-         task_dicts = list(dataset) # type: ignore
-     elif isinstance(dataset, list):
-         task_dicts = dataset
-     else:
-         raise ValueError(f"Dataset must be string, Dataset, or list, got {type(dataset)}")
-
-     # Apply custom system prompt if provided
-     if custom_system_prompt:
-         for task_dict in task_dicts:
-             if "system_prompt" not in task_dict:
-                 task_dict["system_prompt"] = custom_system_prompt
-             else:
-                 task_dict["system_prompt"] += "\n" + custom_system_prompt
-
-     # Prepare job metadata
-     job_metadata = metadata or {}
-     job_metadata.update(
-         {
-             "agent_class": agent_class.__name__,
-             "agent_config": agent_config,
-             "parallel_mode": "process_pool",
-             "max_workers": max_workers,
-             "max_concurrent_per_worker": max_concurrent_per_worker,
-             "total_tasks": len(task_dicts),
-         }
-     )
-
-     # Extract dataset verification info if available (match original)
-     if isinstance(dataset, Dataset) and not dataset_link:
-         try:
-             general_info = next(iter(dataset.info.__dict__["download_checksums"].keys())).split("/")
-             project = general_info[3]
-             dataset_name = general_info[4].split("@")[0]
-             dataset_link = f"{project}/{dataset_name}"
-         except Exception:
-             logger.warning("Failed to extract dataset verification info")
-
-     # task_dicts = task_dicts[:10]
-
-     # Create job context
-     with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
-         # Prepare agent class info for pickling
-         agent_module = agent_class.__module__
-         agent_name = agent_class.__name__
-
-         # Divide tasks evenly among workers
-         num_tasks = len(task_dicts)
-         tasks_per_worker = (num_tasks + max_workers - 1) // max_workers # Ceiling division
-
-         task_batches: list[list[tuple[int, dict[str, Any]]]] = []
-         for i in range(0, num_tasks, tasks_per_worker):
-             batch = [
-                 (idx, task_dict)
-                 for idx, task_dict in enumerate(task_dicts[i : i + tasks_per_worker], start=i)
-             ]
-             if batch: # Only add non-empty batches
-                 task_batches.append(batch)
-
-         logger.info(
-             "Distributing %s tasks across %s workers (~%s tasks per worker)",
-             num_tasks,
-             len(task_batches),
-             tasks_per_worker,
-         )
-
-         # Initialize results list
-         results: list[Any] = [None] * len(task_dicts)
-
-         # Create worker function with all needed context
-         worker_func = partial(
-             _process_worker,
-             agent_class_module=agent_module,
-             agent_class_name=agent_name,
-             agent_config=agent_config,
-             job_id=job_obj.id,
-             job_name=name,
-             max_steps=max_steps,
-             auto_respond=auto_respond,
-             total_workers=min(max_workers, len(task_batches)),
-             max_concurrent_per_worker=max_concurrent_per_worker,
-         )
-
-         # Process batches in parallel using ProcessPoolExecutor
-         executor = ProcessPoolExecutor(
-             max_workers=max_workers,
-             mp_context=multiprocessing.get_context("spawn"),
-         )
-         try:
-             # Submit all batches to workers
-             future_to_batch = {
-                 executor.submit(worker_func, batch, worker_id=i): batch
-                 for i, batch in enumerate(task_batches)
-             }
-
-             # Track progress
-             completed = 0
-             total = len(task_dicts)
-
-             # Process results as they complete
-             try:
-                 for future in as_completed(future_to_batch):
-                     batch = future_to_batch[future]
-
-                     try:
-                         # Get results from this worker
-                         batch_results = future.result()
-
-                         # Place results in correct positions
-                         for index, result in batch_results:
-                             results[index] = result
-                             completed += 1
-
-                         # Calculate success rate so far
-                         successful_so_far = sum(
-                             1
-                             for r in results[:completed]
-                             if r is not None and getattr(r, "reward", 0) > 0
-                         )
-
-                         progress_msg = (
-                             f"Progress: {completed}/{total} tasks completed "
-                             f"({100 * completed / total:.1f}%) | "
-                             f"Success rate: {successful_so_far}/{completed} "
-                             f"({100 * successful_so_far / completed:.1f}%)"
-                         )
-
-                         logger.info(progress_msg)
-
-                     except Exception as e:
-                         # Handle worker failure
-                         logger.error(
-                             "Worker failed with exception: %s\n%s", e, traceback.format_exc()
-                         )
-
-                         # Mark all tasks in this batch as failed
-                         for index, _ in batch:
-                             results[index] = {
-                                 "error": f"Worker process failed: {e}",
-                                 "isError": True,
-                                 "reward": 0.0,
-                                 "done": False,
-                                 "content": f"Worker process failed: {e}",
-                             }
-                             completed += 1
-
-             except KeyboardInterrupt:
-                 logger.warning("\n⚠️ Parallel evaluation interrupted by user (Ctrl+C)")
-                 logger.info("Cancelling pending tasks...")
-
-                 # Cancel all pending futures
-                 for future in future_to_batch:
-                     if not future.done():
-                         future.cancel()
-
-                 # Mark uncompleted tasks as interrupted
-                 for i, r in enumerate(results):
-                     if r is None:
-                         results[i] = {
-                             "error": "Evaluation interrupted by user",
-                             "isError": True,
-                             "reward": 0.0,
-                             "done": False,
-                             "content": "Task interrupted (Ctrl+C)",
-                         }
-
-                 logger.info("Interrupted after %s/%s tasks", completed, total)
-                 raise # Re-raise to propagate the interrupt
-
-         finally:
-             # Always shutdown the executor properly
-             executor.shutdown(wait=False, cancel_futures=True)
-
-         # Verify all results are populated
-         missing = [i for i, r in enumerate(results) if r is None]
-         if missing:
-             logger.warning("Missing results for task indices: %s...", missing[:10])
-             for idx in missing:
-                 results[idx] = {
-                     "error": "No result returned from worker",
-                     "isError": True,
-                     "reward": 0.0,
-                     "done": False,
-                     "content": "Task was not processed",
-                 }
-
-         # Print final summary
-         total_tasks = len(results)
-         successful_tasks = sum(1 for r in results if getattr(r, "reward", 0) > 0)
-         failed_tasks = sum(1 for r in results if isinstance(r, dict) and r.get("isError", False))
-
-         logger.info("\n")
-         logger.info("=" * 60)
-         logger.info("📊 Parallel Evaluation Complete!")
-         logger.info("=" * 60)
-         logger.info("Total tasks: %s", total_tasks)
-         logger.info("Successful: %s (%s%%)", successful_tasks, 100 * successful_tasks / total_tasks)
-         logger.info("Failed: %s", failed_tasks)
-         logger.info("Workers used: %s", max_workers)
-         logger.info("=" * 60)
-
-         logger.info(
-             "Parallel dataset run completed: %s tasks, %s successful (%s%%)",
-             total_tasks,
-             successful_tasks,
-             100 * successful_tasks / total_tasks,
-         )
-
-         return results
-
-
- def calculate_optimal_workers(num_tasks: int, reserve_system_resources: bool = True) -> int:
-     """
-     Calculate optimal number of workers based on CPU cores and task count.
-
-     Simple heuristic:
-     - 1 worker per CPU core (minus 1-2 for system if reserve_system_resources)
-     - But don't create more workers than tasks
-     - Cap at reasonable maximum
-
-     Args:
-         num_tasks: Total number of tasks to process
-         reserve_system_resources: Whether to leave CPU cores for system (default True)
-
-     Returns:
-         Optimal number of workers
-     """
-     # Get CPU count
-     cpu_count = os.cpu_count() or 4
-
-     # Reserve 1-2 cores for system if requested
-     if reserve_system_resources:
-         if cpu_count > 8:
-             available_cpus = cpu_count - 2 # Reserve 2 for systems with many cores
-         elif cpu_count > 2:
-             available_cpus = cpu_count - 1 # Reserve 1 for typical systems
-         else:
-             available_cpus = 1 # Minimum 1 worker
-     else:
-         available_cpus = cpu_count
-
-     # Cap at 32 workers to be reasonable
-     max_workers = min(available_cpus, 32)
-
-     # Don't create more workers than tasks
-     # But try to have at least 5-10 tasks per worker for efficiency
-     if num_tasks <= max_workers:
-         return min(num_tasks, max_workers)
-     else:
-         # For many tasks, use all available workers
-         # unless that would give us very few tasks per worker
-         min_tasks_per_worker = 10
-         ideal_workers = min(max_workers, max(1, num_tasks // min_tasks_per_worker))
-         return ideal_workers
-
-
- async def run_dataset_parallel(
-     name: str,
-     dataset: str | Dataset | list[dict[str, Any]],
-     agent_class: type[MCPAgent],
-     agent_config: dict[str, Any] | None = None,
-     max_concurrent: int | None = None,
-     metadata: dict[str, Any] | None = None,
-     max_steps: int = 10,
-     **kwargs: Any,
- ) -> list[Any]:
-     """
-     Run all tasks in a dataset using automatically optimized process-based parallelism.
-
-     This function automatically determines the optimal number of workers
-     and batch sizes based on system resources and dataset size. For manual control
-     over worker configuration, use `run_dataset_parallel_manual`.
-
-     Args:
-         name: Name for the job
-         dataset: Dataset to run
-         agent_class: Agent class to use
-         agent_config: Agent configuration
-         max_concurrent: Maximum total concurrent tasks across all workers (prevents rate limits)
-         metadata: Optional metadata
-         max_steps: Maximum steps per task
-         **kwargs: Additional arguments passed to run_dataset_parallel_manual
-
-     Example:
-         >>> # Automatically handles 400+ tasks efficiently
-         >>> results = await run_dataset_parallel(
-         ...     "Large Evaluation",
-         ...     "hud-evals/benchmark-400",
-         ...     ClaudeAgent,
-         ...     max_concurrent=50, # Limit to 50 concurrent API calls
-         ... )
-     """
-     # Load dataset to get size
-     num_tasks: int
-
-     if isinstance(dataset, str):
-         from datasets import load_dataset as hf_load_dataset
-
-         dataset_obj = hf_load_dataset(dataset, split=kwargs.get("split", "train"))
-         num_tasks = len(dataset_obj) # type: ignore
-     elif hasattr(dataset, "__len__"):
-         num_tasks = len(dataset)
-     else:
-         # Convert to list to count
-         dataset_list: list[dict[str, Any]] = list(dataset) # type: ignore
-         num_tasks = len(dataset_list)
-         dataset = dataset_list
-
-     # Calculate optimal configuration
-     num_workers = calculate_optimal_workers(num_tasks)
-
-     # Set default max_concurrent_per_worker if not using total limit
-     if max_concurrent is None:
-         max_concurrent_per_worker = 25 # Reasonable default
-     else:
-         max_concurrent_per_worker = max(1, max_concurrent // num_workers)
-
-     logger.info(
-         "Auto-configured for %s tasks: %s workers, %s concurrent per worker",
-         num_tasks,
-         num_workers,
-         max_concurrent_per_worker,
-     )
-
-     # Add auto-configuration info to metadata
-     if metadata is None:
-         metadata = {}
-     metadata["auto_configured"] = True
-     metadata["auto_num_workers"] = num_workers
-
-     # Run with optimized settings
-     return await run_dataset_parallel_manual(
-         name=name,
-         dataset=dataset,
-         agent_class=agent_class,
-         agent_config=agent_config,
-         max_workers=num_workers,
-         max_concurrent_per_worker=max_concurrent_per_worker,
-         max_concurrent=max_concurrent,
-         metadata=metadata,
-         max_steps=max_steps,
-         **kwargs,
-     )
hud/misc/__init__.py DELETED
@@ -1 +0,0 @@
- """Miscellaneous utilities for HUD SDK."""
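The deleted hud/datasets/parallel.py above combines three standard building blocks: a ProcessPoolExecutor that spreads task batches across worker processes, a fresh asyncio event loop inside each worker, and an asyncio.Semaphore that caps concurrency within a worker. The following is a minimal, self-contained sketch of that general pattern, not part of the hud package; the names _worker and run_parallel and the asyncio.sleep placeholder for per-task work are hypothetical and for illustration only.

```python
import asyncio
from concurrent.futures import ProcessPoolExecutor


def _worker(batch: list[tuple[int, str]], max_concurrent: int = 4) -> list[tuple[int, str]]:
    """Runs in a child process: creates its own event loop and bounds concurrency."""

    async def _run() -> list[tuple[int, str]]:
        sem = asyncio.Semaphore(max_concurrent)

        async def one(idx: int, prompt: str) -> tuple[int, str]:
            async with sem:
                await asyncio.sleep(0.01)  # stand-in for the real per-task work
                return idx, f"done: {prompt}"

        return await asyncio.gather(*(one(i, p) for i, p in batch))

    return asyncio.run(_run())


def run_parallel(prompts: list[str], workers: int = 2) -> list[str | None]:
    """Split tasks into per-worker batches and reassemble results in input order."""
    indexed = list(enumerate(prompts))
    batches = [indexed[i::workers] for i in range(workers)]
    results: list[str | None] = [None] * len(prompts)
    with ProcessPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(_worker, b) for b in batches if b]
        for fut in futures:
            for idx, out in fut.result():
                results[idx] = out
    return results


if __name__ == "__main__":
    print(run_parallel([f"task-{i}" for i in range(6)]))
```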