hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/utils/agent_factories.py DELETED
@@ -1,86 +0,0 @@
- """Factory functions for creating agents compatible with run_dataset."""
-
- from __future__ import annotations
-
- from typing import Any
-
- from openai import AsyncOpenAI
-
- from hud.agents.grounded_openai import GroundedOpenAIChatAgent
- from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
- from hud.tools.grounding import GrounderConfig
-
-
- def create_openai_agent(**kwargs: Any) -> GenericOpenAIChatAgent:
-     """Factory for GenericOpenAIChatAgent with run_dataset compatibility.
-
-     Args:
-         api_key: OpenAI API key
-         base_url: Optional custom API endpoint
-         model_name: Model to use (e.g., "gpt-4o-mini")
-         **kwargs: Additional arguments passed to GenericOpenAIChatAgent
-
-     Returns:
-         Configured GenericOpenAIChatAgent instance
-
-     Example:
-         >>> from hud.datasets import run_dataset
-         >>> from hud.utils.agent_factories import create_openai_agent
-         >>> results = await run_dataset(
-         ...     "My Eval",
-         ...     "hud-evals/SheetBench-50",
-         ...     create_openai_agent,
-         ...     {"api_key": "your-key", "model_name": "gpt-4o-mini"},
-         ... )
-     """
-     api_key = kwargs.pop("api_key", None)
-     base_url = kwargs.pop("base_url", None)
-
-     openai_client = AsyncOpenAI(api_key=api_key, base_url=base_url)
-
-     return GenericOpenAIChatAgent(openai_client=openai_client, **kwargs)
-
-
- def create_grounded_agent(**kwargs: Any) -> GroundedOpenAIChatAgent:
-     """Factory for GroundedOpenAIChatAgent with run_dataset compatibility.
-
-     Args:
-         api_key: OpenAI API key for planning model
-         base_url: Optional custom API endpoint for planning model
-         model_name: Planning model to use (e.g., "gpt-4o-mini")
-         grounder_api_key: API key for grounding model
-         grounder_api_base: API base URL for grounding model (default: OpenRouter)
-         grounder_model: Grounding model to use (default: qwen/qwen-2.5-vl-7b-instruct)
-         **kwargs: Additional arguments passed to GroundedOpenAIChatAgent
-
-     Returns:
-         Configured GroundedOpenAIChatAgent instance
-
-     Example:
-         >>> from hud.datasets import run_dataset
-         >>> from hud.utils.agent_factories import create_grounded_agent
-         >>> results = await run_dataset(
-         ...     "Grounded Eval",
-         ...     dataset,
-         ...     create_grounded_agent,
-         ...     {
-         ...         "api_key": "openai-key",
-         ...         "grounder_api_key": "openrouter-key",
-         ...         "model_name": "gpt-4o-mini",
-         ...     },
-         ... )
-     """
-     api_key = kwargs.pop("api_key", None)
-     base_url = kwargs.pop("base_url", None)
-     grounder_api_key = kwargs.pop("grounder_api_key", None)
-     grounder_api_base = kwargs.pop("grounder_api_base", "https://openrouter.ai/api/v1")
-     grounder_model = kwargs.pop("grounder_model", "qwen/qwen-2.5-vl-7b-instruct")
-
-     openai_client = AsyncOpenAI(api_key=api_key, base_url=base_url)
-     grounder_config = GrounderConfig(
-         api_base=grounder_api_base, model=grounder_model, api_key=grounder_api_key
-     )
-
-     return GroundedOpenAIChatAgent(
-         openai_client=openai_client, grounder_config=grounder_config, **kwargs
-     )
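
With this factory module removed, the two steps it performed (build an AsyncOpenAI client, hand it to the agent) can be inlined at the call site. A minimal sketch against the 0.4.45 signatures shown above; the import path reflects the openai_chat_generic.py → openai_chat.py rename in the file list and is an assumption about 0.5.x:

    # Inline replacement for the removed create_openai_agent helper.
    # Assumes GenericOpenAIChatAgent still accepts openai_client and model_name
    # as in the 0.4.45 code above; 0.5.x names may differ.
    from openai import AsyncOpenAI
    from hud.agents.openai_chat import GenericOpenAIChatAgent  # assumed 0.5.x path

    client = AsyncOpenAI(api_key="your-key", base_url=None)
    agent = GenericOpenAIChatAgent(openai_client=client, model_name="gpt-4o-mini")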
hud/utils/async_utils.py DELETED
@@ -1,65 +0,0 @@
- """Async utilities for HUD SDK.
-
- This module provides utilities for running async code in various environments,
- including Jupyter notebooks and synchronous contexts.
- """
-
- from __future__ import annotations
-
- import asyncio
- import logging
- import threading
- from typing import TYPE_CHECKING, Any
-
- if TYPE_CHECKING:
-     from collections.abc import Coroutine
-
- logger = logging.getLogger(__name__)
-
-
- def fire_and_forget(coro: Coroutine[Any, Any, Any], description: str = "task") -> None:
-     """Execute a coroutine in a fire-and-forget manner.
-
-     This function handles running async code in various contexts:
-     - When an event loop is already running (normal async context)
-     - When no event loop exists (sync context, some Jupyter setups)
-     - Gracefully handles interpreter shutdown
-
-     Args:
-         coro: The coroutine to execute
-         description: Description of the task for logging (e.g., "update job status")
-
-     Example:
-         fire_and_forget(
-             some_async_function(),
-             description="update status"
-         )
-     """
-     try:
-         # Try to get current event loop
-         loop = asyncio.get_running_loop()
-         # Schedule the coroutine
-         task = loop.create_task(coro)
-         # Add error handler to prevent unhandled exceptions
-         task.add_done_callback(lambda t: t.exception() if not t.cancelled() else None)
-     except RuntimeError:
-         # No running event loop (e.g., Jupyter without %autoawait, sync context)
-         try:
-             # Try to run in a thread as a fallback
-             def run_in_thread() -> None:
-                 loop = asyncio.new_event_loop()
-                 asyncio.set_event_loop(loop)
-                 try:
-                     loop.run_until_complete(coro)
-                 except Exception as e:
-                     # Suppress warnings about interpreter shutdown
-                     if "interpreter shutdown" not in str(e):
-                         logger.debug("Error in threaded %s: %s", description, e)
-
-             thread = threading.Thread(target=run_in_thread, daemon=True)
-             thread.start()
-         except Exception as e:
-             # If that fails too, just log and continue
-             # Special case: suppress "cannot schedule new futures after interpreter shutdown"
-             if "interpreter shutdown" not in str(e):
-                 logger.debug("Could not %s - no event loop available: %s", description, e)
hud/utils/group_eval.py DELETED
@@ -1,223 +0,0 @@
- """Utilities for grouped evaluation of tasks, following the RL pattern."""
-
- from __future__ import annotations
-
- import asyncio
- from statistics import mean, stdev
- from typing import Any
-
- import numpy as np
-
- import hud
- from hud.datasets import Task
- from hud.types import Trace
- from hud.utils.hud_console import HUDConsole
-
- hud_console = HUDConsole()
-
-
- async def run_tasks_grouped(
-     tasks: list[Any],
-     agent_class: type | Any,
-     agent_config: dict[str, Any] | None = None,
-     group_size: int = 1,
-     max_parallel_episodes: int = 48,
-     max_steps: int = 10,
-     verbose: bool = False,
-     job_id: str | None = None,
- ) -> list[dict[str, Any]]:
-     """
-     Run tasks with grouping, following the RL Actor pattern.
-
-     Args:
-         tasks: List of tasks to run
-         agent_class: Agent class or instance to use
-         agent_config: Configuration for agent instantiation
-         group_size: Number of times to run each task
-         max_parallel_episodes: Maximum parallel episodes to run
-         max_steps: Maximum steps per episode
-         verbose: Whether to show progress
-         job_id: Optional job ID for tracking
-
-     Returns:
-         List of statistics for each task group
-     """
-     agent_config = agent_config or {}
-
-     # Duplicate tasks according to group_size, exactly like RL
-     grouped_tasks = []
-     task_mapping = []  # Track which group each result belongs to
-
-     for i, task in enumerate(tasks):
-         for _ in range(group_size):
-             grouped_tasks.append(task)
-             task_mapping.append(i)
-
-     hud_console.info(
-         f"Running {len(tasks)} tasks with group_size={group_size} ({len(grouped_tasks)} total runs)"
-     )
-
-     # Run all episodes, respecting max_parallel_episodes
-     all_traces = []
-
-     for batch_start in range(0, len(grouped_tasks), max_parallel_episodes):
-         batch_end = min(batch_start + max_parallel_episodes, len(grouped_tasks))
-         batch = grouped_tasks[batch_start:batch_end]
-
-         # Run batch in parallel
-         async def run_single_episode(task_data: dict[str, Any] | Task, idx: int) -> Trace:
-             """Run a single episode."""
-             try:
-                 # Create task if needed
-                 task = Task(**task_data) if isinstance(task_data, dict) else task_data
-
-                 # Create fresh agent instance
-                 if isinstance(agent_class, type):
-                     agent = agent_class(**agent_config)
-                 else:
-                     # Agent is already instantiated
-                     agent = agent_class
-
-                 # Run the task
-                 trace_name = f"Eval | {task.id if hasattr(task, 'id') else 'Task'} | Group {task_mapping[idx]}"  # noqa: E501
-                 with hud.trace(trace_name, job_id=job_id):
-                     result = await agent.run(task, max_steps=max_steps)
-                     return result
-
-             except Exception as e:
-                 hud_console.warning_log(f"Episode failed: {e}")
-                 return Trace(isError=True, content=str(e), reward=0.0, done=True)
-
-         # Run batch
-         batch_results = await asyncio.gather(
-             *[run_single_episode(t, batch_start + i) for i, t in enumerate(batch)],
-             return_exceptions=True,
-         )
-
-         # Normalize exceptions to error traces
-         for res in batch_results:
-             if isinstance(res, Exception):
-                 hud_console.warning_log(f"Episode error: {res}")
-                 all_traces.append(Trace(isError=True, content=str(res), reward=0.0, done=True))
-             else:
-                 all_traces.append(res)
-
-         if verbose:
-             hud_console.info(f"Completed batch: {len(all_traces)}/{len(grouped_tasks)} episodes")
-
-     # Group results back by original task and calculate statistics
-     return calculate_group_statistics(tasks, all_traces, task_mapping, group_size)
-
-
- def calculate_group_statistics(
-     original_tasks: list[Any],
-     traces: list[Trace],
-     task_mapping: list[int],
-     group_size: int,
- ) -> list[dict[str, Any]]:
-     """
-     Calculate statistics for each group, similar to preprocess_advantages.
-
-     Args:
-         original_tasks: Original task list
-         traces: All traces from grouped runs
-         task_mapping: Mapping of trace index to task index
-         group_size: Number of runs per task
-
-     Returns:
-         List of statistics for each task
-     """
-     stats = []
-
-     # Process each original task
-     for task_idx, task in enumerate(original_tasks):
-         # Get all traces for this task
-         task_traces = [
-             traces[i] for i, mapping_idx in enumerate(task_mapping) if mapping_idx == task_idx
-         ]
-
-         # Extract rewards
-         rewards = np.array([t.reward for t in task_traces])
-         errors = [t for t in task_traces if t.isError]
-
-         # Calculate statistics
-         task_stats = {
-             "task_id": task.id
-             if isinstance(task, Task) and hasattr(task, "id")
-             else f"task_{task_idx}",
-             "prompt": task.prompt if isinstance(task, Task) else task.get("prompt", ""),
-             "group_size": group_size,
-             "rewards": rewards.tolist(),
-             "mean_reward": float(np.mean(rewards)),
-             "std_reward": float(np.std(rewards)) if len(rewards) > 1 else 0.0,
-             "min_reward": float(np.min(rewards)),
-             "max_reward": float(np.max(rewards)),
-             "success_rate": float(np.sum(rewards > 0) / len(rewards)) if len(rewards) > 0 else 0.0,
-             "error_rate": len(errors) / len(task_traces) if len(task_traces) > 0 else 0.0,
-             "traces": task_traces,  # Keep full traces for detailed analysis
-         }
-
-         # Add variance info like RL does
-         if task_stats["std_reward"] > 1e-6:
-             task_stats["normalized_rewards"] = [
-                 (r - task_stats["mean_reward"]) / task_stats["std_reward"] for r in rewards
-             ]
-         else:
-             task_stats["normalized_rewards"] = [0.0] * len(rewards)
-
-         stats.append(task_stats)
-
-     return stats
-
-
- def display_group_statistics(stats: list[dict[str, Any]], show_details: bool = True) -> None:
-     """Display statistics from grouped evaluation."""
-     from rich.console import Console
-     from rich.table import Table
-
-     console = Console()
-
-     # Overall statistics
-     all_means = [s["mean_reward"] for s in stats]
-     overall_mean = mean(all_means) if all_means else 0.0
-     overall_std = stdev(all_means) if len(all_means) > 1 else 0.0
-
-     hud_console.success("\n📊 Evaluation Summary")
-     hud_console.info(f"Tasks evaluated: {len(stats)}")
-     hud_console.info(f"Episodes per task: {stats[0]['group_size'] if stats else 0}")
-     hud_console.info(f"Total episodes: {sum(len(s['rewards']) for s in stats)}")
-     hud_console.info(f"Overall mean reward: {overall_mean:.3f} ± {overall_std:.3f}")
-
-     # Detailed table
-     if show_details and len(stats) <= 50:  # Only show for reasonable dataset sizes
-         table = Table(title="\nPer-Task Performance Distribution")
-         table.add_column("Task", style="cyan", no_wrap=True)
-         table.add_column("Mean±Std", justify="right", style="green")
-         table.add_column("Min/Max", justify="right")
-         table.add_column("Success%", justify="right", style="yellow")
-         table.add_column("Rewards", style="dim")
-
-         for stat in stats:
-             task_name = stat["prompt"][:30] + "..." if len(stat["prompt"]) > 30 else stat["prompt"]
-             rewards_str = " ".join([f"{r:.2f}" for r in stat["rewards"][:5]])
-             if len(stat["rewards"]) > 5:
-                 rewards_str += " ..."
-
-             table.add_row(
-                 task_name,
-                 f"{stat['mean_reward']:.3f}±{stat['std_reward']:.3f}",
-                 f"{stat['min_reward']:.2f}/{stat['max_reward']:.2f}",
-                 f"{stat['success_rate'] * 100:.0f}%",
-                 rewards_str,
-             )
-
-         console.print(table)
-
-     # High variance tasks
-     high_variance_tasks = [s for s in stats if s["std_reward"] > 0.3 and s["group_size"] > 1]
-     if high_variance_tasks:
-         hud_console.warning(f"\n{len(high_variance_tasks)} tasks show high variance (std > 0.3)")
-         for task in high_variance_tasks[:3]:
-             hud_console.info(
-                 f"  • {task['task_id']}: μ={task['mean_reward']:.3f}, σ={task['std_reward']:.3f}"  # noqa: RUF001
-             )
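
The bookkeeping worth keeping straight here: each task is duplicated group_size times, and task_mapping records which original task each episode belongs to so results can be regrouped. A self-contained sketch of just that step, with plain floats standing in for Trace objects:

    from statistics import mean, pstdev

    def regroup(rewards: list[float], task_mapping: list[int], n_tasks: int) -> list[dict]:
        # task_mapping[i] is the original task index of the i-th episode.
        groups: list[list[float]] = [[] for _ in range(n_tasks)]
        for reward, task_idx in zip(rewards, task_mapping):
            groups[task_idx].append(reward)
        # pstdev matches the np.std (population) convention used above.
        return [
            {"mean_reward": mean(g), "std_reward": pstdev(g) if len(g) > 1 else 0.0}
            for g in groups
        ]

    # Two tasks run with group_size=2 produce mapping [0, 0, 1, 1]:
    print(regroup([1.0, 0.0, 0.5, 0.5], [0, 0, 1, 1], n_tasks=2))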
hud/utils/progress.py DELETED
@@ -1,149 +0,0 @@
- from __future__ import annotations
-
- import time
- from collections import defaultdict
-
-
- class StepProgressTracker:
-     """
-     Tracks progress across potentially parallel async tasks based on steps completed.
-     Provides estimates assuming tasks run up to max_steps_per_task.
-     """
-
-     def __init__(self, total_tasks: int, max_steps_per_task: int) -> None:
-         """
-         Initialize the StepProgressTracker.
-
-         Args:
-             total_tasks: The total number of tasks to track.
-             max_steps_per_task: The maximum number of steps per task.
-
-         Raises:
-             ValueError: If total_tasks or max_steps_per_task is not positive.
-         """
-         if total_tasks <= 0:
-             raise ValueError("total_tasks must be positive")
-         if max_steps_per_task <= 0:
-             raise ValueError("max_steps_per_task must be positive")
-
-         self.total_tasks = total_tasks
-         self.max_steps_per_task = max_steps_per_task
-         self.total_potential_steps = total_tasks * max_steps_per_task
-
-         # Use asyncio.Lock for potentially concurrent updates/reads if needed,
-         # but start without for simplicity in single-threaded asyncio.
-         # self._lock = asyncio.Lock()
-         self._task_steps: dict[str, int] = defaultdict(int)
-         self._finished_tasks: dict[str, bool] = defaultdict(bool)
-         self._tasks_started = 0
-         self._tasks_finished = 0
-
-         self.start_time: float | None = None
-         self.current_total_steps = 0
-
-     def start_task(self, task_id: str) -> None:
-         # async with self._lock:  # If using lock
-         if self.start_time is None:
-             self.start_time = time.monotonic()
-         self._task_steps[task_id] = 0
-         self._finished_tasks[task_id] = False
-         self._tasks_started += 1
-
-     def increment_step(self, task_id: str) -> None:
-         # async with self._lock:
-         if (
-             not self._finished_tasks[task_id]
-             and self._task_steps[task_id] < self.max_steps_per_task
-         ):
-             self._task_steps[task_id] += 1
-             # Update overall progress immediately
-             self._update_total_steps()
-
-     def finish_task(self, task_id: str) -> None:
-         # async with self._lock:
-         if not self._finished_tasks[task_id]:
-             # For calculation, consider a finished task as having completed max steps
-             self._task_steps[task_id] = self.max_steps_per_task
-             self._finished_tasks[task_id] = True
-             self._tasks_finished += 1
-             # Update overall progress
-             self._update_total_steps()
-
-     def _update_total_steps(self) -> None:
-         # This could be expensive if called extremely frequently.
-         # Called after increment or finish.
-         # async with self._lock:
-         self.current_total_steps = sum(self._task_steps.values())
-
-     def get_progress(self) -> tuple[int, int, float]:
-         """Returns (current_steps, total_potential_steps, percentage)."""
-         # async with self._lock:
-         # Recalculate here for safety, though _update_total_steps should keep it current
-         # current_steps = sum(self._task_steps.values())
-         current_steps = self.current_total_steps
-
-         percentage = 0.0
-         if self.total_potential_steps > 0:
-             percentage = (current_steps / self.total_potential_steps) * 100
-         return current_steps, self.total_potential_steps, percentage
-
-     def get_stats(self) -> tuple[float, float | None]:
-         """Returns (rate_steps_per_minute, eta_seconds_upper_bound)."""
-         # async with self._lock:
-         if self.start_time is None or self._tasks_started == 0:
-             return 0.0, None  # No rate or ETA yet
-
-         elapsed_time = time.monotonic() - self.start_time
-         current_steps = self.current_total_steps
-
-         rate_sec = 0.0
-         if elapsed_time > 0:
-             rate_sec = current_steps / elapsed_time
-
-         rate_min = rate_sec * 60  # Convert rate to steps per minute
-
-         eta = None
-         # ETA calculation still uses rate_sec (steps/second) for time estimation in seconds
-         if rate_sec > 0:
-             remaining_steps = self.total_potential_steps - current_steps
-             eta = remaining_steps / rate_sec if remaining_steps > 0 else 0.0
-
-         return rate_min, eta  # Return rate in steps/min
-
-     def is_finished(self) -> bool:
-         # async with self._lock:
-         return self._tasks_finished >= self.total_tasks
-
-     def display(self, bar_length: int = 40) -> str:
-         """Generates a progress string similar to tqdm."""
-         current_steps, total_steps, percentage = self.get_progress()
-         rate_min, eta = self.get_stats()  # Rate is now per minute
-
-         # Ensure valid values for display
-         current_steps = min(current_steps, total_steps)
-         percentage = max(0.0, min(100.0, percentage))
-
-         filled_length = int(bar_length * current_steps // total_steps) if total_steps else 0
-         bar = "█" * filled_length + "-" * (bar_length - filled_length)
-
-         # Format time
-         elapsed_str = "0:00"
-         eta_str = "??:??"
-         if self.start_time:
-             elapsed_seconds = int(time.monotonic() - self.start_time)
-             elapsed_str = f"{elapsed_seconds // 60}:{elapsed_seconds % 60:02d}"
-             if eta is not None:
-                 eta_seconds = int(eta)
-                 eta_str = f"{eta_seconds // 60}:{eta_seconds % 60:02d}"
-             elif self.is_finished():
-                 eta_str = "0:00"
-
-         # Update rate string format
-         rate_str = f"{rate_min:.1f} steps/min" if rate_min > 0 else "?? steps/min"
-
-         # Format steps - use K/M for large numbers if desired, keep simple for now
-         steps_str = f"{current_steps}/{total_steps}"
-
-         # tasks_str = f" {self._tasks_finished}/{self.total_tasks} tasks"  # Optional tasks counter
-
-         return f"{percentage:3.0f}%|{bar}| {steps_str} [{elapsed_str}<{eta_str}, {rate_str}]"
hud/utils/tasks.py DELETED
@@ -1,127 +0,0 @@
- from __future__ import annotations
-
- import json
- from pathlib import Path
-
- from hud.types import Task
- from hud.utils.hud_console import HUDConsole
-
- hud_console = HUDConsole()
-
-
- def load_tasks(tasks_input: str | list[dict], *, raw: bool = False) -> list[Task] | list[dict]:
-     """Load tasks from various sources.
-
-     Args:
-         tasks_input: Either:
-             - Path to a JSON file (array of tasks)
-             - Path to a JSONL file (one task per line)
-             - HuggingFace dataset name (format: "username/dataset" or "username/dataset:split")
-             - List of task dictionaries
-         raw: If True, return raw dicts without validation or env substitution
-
-     Returns:
-         - If raw=False (default): list[Task]
-         - If raw=True: list[dict]
-     """
-     tasks: list[Task] | list[dict] = []
-
-     if isinstance(tasks_input, list):
-         # Direct list of task dicts
-         hud_console.info(f"Loading {len(tasks_input)} tasks from provided list")
-         if raw:
-             return [item for item in tasks_input if isinstance(item, dict)]
-         for item in tasks_input:
-             task = Task(**item)
-             tasks.append(task)
-
-     elif isinstance(tasks_input, str):
-         # Check if it's a file path
-         if Path(tasks_input).exists():
-             file_path = Path(tasks_input)
-
-             with open(file_path, encoding="utf-8") as f:
-                 # Handle JSON files (array of tasks)
-                 if file_path.suffix.lower() == ".json":
-                     data = json.load(f)
-                     if not isinstance(data, list):
-                         raise ValueError(
-                             f"JSON file must contain an array of tasks, got {type(data)}"
-                         )
-                     if raw:
-                         return [item for item in data if isinstance(item, dict)]
-                     for item in data:
-                         task = Task(**item)
-                         tasks.append(task)
-
-                 # Handle JSONL files (one task per line)
-                 else:
-                     raw_items: list[dict] = []
-                     for line in f:
-                         line = line.strip()
-                         if not line:
-                             continue
-                         item = json.loads(line)
-                         if isinstance(item, list):
-                             raw_items.extend([it for it in item if isinstance(it, dict)])
-                         elif isinstance(item, dict):
-                             raw_items.append(item)
-                         else:
-                             raise ValueError(
-                                 f"Invalid JSONL format: expected dict or list of dicts, got {type(item)}"  # noqa: E501
-                             )
-                     if raw:
-                         return raw_items
-                     for it in raw_items:
-                         task = Task(**it)
-                         tasks.append(task)
-
-         # Check if it's a HuggingFace dataset
-         elif "/" in tasks_input:
-             hud_console.info(f"Loading tasks from HuggingFace dataset: {tasks_input}")
-             try:
-                 from datasets import load_dataset
-
-                 # Parse dataset name and optional split
-                 if ":" in tasks_input:
-                     dataset_name, split = tasks_input.split(":", 1)
-                 else:
-                     dataset_name = tasks_input
-                     split = "train"  # Default split
-
-                 dataset = load_dataset(dataset_name, split=split)
-
-                 # Convert dataset rows to Task objects
-                 raw_rows: list[dict] = []
-                 for item in dataset:
-                     if not isinstance(item, dict):
-                         raise ValueError(
-                             f"Invalid HuggingFace dataset: expected dict, got {type(item)}"
-                         )
-                     if not item["mcp_config"] or not item["prompt"]:
-                         raise ValueError(
-                             f"Invalid HuggingFace dataset: expected mcp_config and prompt, got {item}"  # noqa: E501
-                         )
-                     raw_rows.append(item)
-                 if raw:
-                     return raw_rows
-                 for row in raw_rows:
-                     task = Task(**row)
-                     tasks.append(task)
-
-             except ImportError as e:
-                 raise ImportError(
-                     "Please install 'datasets' to load from HuggingFace: uv pip install datasets"
-                 ) from e
-             except Exception as e:
-                 raise ValueError(f"Failed to load HuggingFace dataset '{tasks_input}': {e}") from e
-
-         else:
-             raise ValueError(
-                 f"Invalid tasks input: '{tasks_input}' is neither a file path nor a HuggingFace dataset"  # noqa: E501
-             )
-
-     else:
-         raise TypeError(f"tasks_input must be str or list, got {type(tasks_input)}")
-
-     return tasks
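
For reference, the input shapes load_tasks accepted; the file paths and list literal here are illustrative, and the dataset name comes from the docstring examples earlier in this diff:

    tasks = load_tasks("tasks.json")                     # JSON file: array of task dicts
    tasks = load_tasks("tasks.jsonl")                    # JSONL file: one task dict per line
    tasks = load_tasks("hud-evals/SheetBench-50:train")  # HuggingFace dataset, optional :split
    rows = load_tasks([{"prompt": "...", "mcp_config": {}}], raw=True)  # raw dicts, unvalidated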