hud-python 0.3.5__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (192) hide show
  1. hud/__init__.py +22 -89
  2. hud/agents/__init__.py +15 -0
  3. hud/agents/art.py +101 -0
  4. hud/agents/base.py +599 -0
  5. hud/{mcp → agents}/claude.py +373 -321
  6. hud/{mcp → agents}/langchain.py +250 -250
  7. hud/agents/misc/__init__.py +7 -0
  8. hud/{agent → agents}/misc/response_agent.py +80 -80
  9. hud/{mcp → agents}/openai.py +352 -334
  10. hud/agents/openai_chat_generic.py +154 -0
  11. hud/{mcp → agents}/tests/__init__.py +1 -1
  12. hud/agents/tests/test_base.py +742 -0
  13. hud/agents/tests/test_claude.py +324 -0
  14. hud/{mcp → agents}/tests/test_client.py +363 -324
  15. hud/{mcp → agents}/tests/test_openai.py +237 -238
  16. hud/cli/__init__.py +617 -0
  17. hud/cli/__main__.py +8 -0
  18. hud/cli/analyze.py +371 -0
  19. hud/cli/analyze_metadata.py +230 -0
  20. hud/cli/build.py +427 -0
  21. hud/cli/clone.py +185 -0
  22. hud/cli/cursor.py +92 -0
  23. hud/cli/debug.py +392 -0
  24. hud/cli/docker_utils.py +83 -0
  25. hud/cli/init.py +281 -0
  26. hud/cli/interactive.py +353 -0
  27. hud/cli/mcp_server.py +756 -0
  28. hud/cli/pull.py +336 -0
  29. hud/cli/push.py +370 -0
  30. hud/cli/remote_runner.py +311 -0
  31. hud/cli/runner.py +160 -0
  32. hud/cli/tests/__init__.py +3 -0
  33. hud/cli/tests/test_analyze.py +284 -0
  34. hud/cli/tests/test_cli_init.py +265 -0
  35. hud/cli/tests/test_cli_main.py +27 -0
  36. hud/cli/tests/test_clone.py +142 -0
  37. hud/cli/tests/test_cursor.py +253 -0
  38. hud/cli/tests/test_debug.py +453 -0
  39. hud/cli/tests/test_mcp_server.py +139 -0
  40. hud/cli/tests/test_utils.py +388 -0
  41. hud/cli/utils.py +263 -0
  42. hud/clients/README.md +143 -0
  43. hud/clients/__init__.py +16 -0
  44. hud/clients/base.py +379 -0
  45. hud/clients/fastmcp.py +222 -0
  46. hud/clients/mcp_use.py +278 -0
  47. hud/clients/tests/__init__.py +1 -0
  48. hud/clients/tests/test_client_integration.py +111 -0
  49. hud/clients/tests/test_fastmcp.py +342 -0
  50. hud/clients/tests/test_protocol.py +188 -0
  51. hud/clients/utils/__init__.py +1 -0
  52. hud/clients/utils/retry_transport.py +160 -0
  53. hud/datasets.py +322 -192
  54. hud/misc/__init__.py +1 -0
  55. hud/{agent → misc}/claude_plays_pokemon.py +292 -283
  56. hud/otel/__init__.py +35 -0
  57. hud/otel/collector.py +142 -0
  58. hud/otel/config.py +164 -0
  59. hud/otel/context.py +536 -0
  60. hud/otel/exporters.py +366 -0
  61. hud/otel/instrumentation.py +97 -0
  62. hud/otel/processors.py +118 -0
  63. hud/otel/tests/__init__.py +1 -0
  64. hud/otel/tests/test_processors.py +197 -0
  65. hud/server/__init__.py +5 -5
  66. hud/server/context.py +114 -0
  67. hud/server/helper/__init__.py +5 -0
  68. hud/server/low_level.py +132 -0
  69. hud/server/server.py +166 -0
  70. hud/server/tests/__init__.py +3 -0
  71. hud/settings.py +73 -79
  72. hud/shared/__init__.py +5 -0
  73. hud/{exceptions.py → shared/exceptions.py} +180 -180
  74. hud/{server → shared}/requests.py +264 -264
  75. hud/shared/tests/test_exceptions.py +157 -0
  76. hud/{server → shared}/tests/test_requests.py +275 -275
  77. hud/telemetry/__init__.py +25 -30
  78. hud/telemetry/instrument.py +379 -0
  79. hud/telemetry/job.py +309 -141
  80. hud/telemetry/replay.py +74 -0
  81. hud/telemetry/trace.py +83 -0
  82. hud/tools/__init__.py +33 -34
  83. hud/tools/base.py +365 -65
  84. hud/tools/bash.py +161 -137
  85. hud/tools/computer/__init__.py +15 -13
  86. hud/tools/computer/anthropic.py +437 -420
  87. hud/tools/computer/hud.py +376 -334
  88. hud/tools/computer/openai.py +295 -292
  89. hud/tools/computer/settings.py +82 -0
  90. hud/tools/edit.py +314 -290
  91. hud/tools/executors/__init__.py +30 -30
  92. hud/tools/executors/base.py +539 -532
  93. hud/tools/executors/pyautogui.py +621 -619
  94. hud/tools/executors/tests/__init__.py +1 -1
  95. hud/tools/executors/tests/test_base_executor.py +338 -338
  96. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  97. hud/tools/executors/xdo.py +511 -503
  98. hud/tools/{playwright_tool.py → playwright.py} +412 -379
  99. hud/tools/tests/__init__.py +3 -3
  100. hud/tools/tests/test_base.py +282 -0
  101. hud/tools/tests/test_bash.py +158 -152
  102. hud/tools/tests/test_bash_extended.py +197 -0
  103. hud/tools/tests/test_computer.py +425 -52
  104. hud/tools/tests/test_computer_actions.py +34 -34
  105. hud/tools/tests/test_edit.py +259 -240
  106. hud/tools/tests/test_init.py +27 -27
  107. hud/tools/tests/test_playwright_tool.py +183 -183
  108. hud/tools/tests/test_tools.py +145 -157
  109. hud/tools/tests/test_utils.py +156 -156
  110. hud/tools/types.py +72 -0
  111. hud/tools/utils.py +50 -50
  112. hud/types.py +136 -89
  113. hud/utils/__init__.py +10 -16
  114. hud/utils/async_utils.py +65 -0
  115. hud/utils/design.py +168 -0
  116. hud/utils/mcp.py +55 -0
  117. hud/utils/progress.py +149 -149
  118. hud/utils/telemetry.py +66 -66
  119. hud/utils/tests/test_async_utils.py +173 -0
  120. hud/utils/tests/test_init.py +17 -21
  121. hud/utils/tests/test_progress.py +261 -225
  122. hud/utils/tests/test_telemetry.py +82 -37
  123. hud/utils/tests/test_version.py +8 -8
  124. hud/version.py +7 -7
  125. hud_python-0.4.1.dist-info/METADATA +476 -0
  126. hud_python-0.4.1.dist-info/RECORD +132 -0
  127. hud_python-0.4.1.dist-info/entry_points.txt +3 -0
  128. {hud_python-0.3.5.dist-info → hud_python-0.4.1.dist-info}/licenses/LICENSE +21 -21
  129. hud/adapters/__init__.py +0 -8
  130. hud/adapters/claude/__init__.py +0 -5
  131. hud/adapters/claude/adapter.py +0 -180
  132. hud/adapters/claude/tests/__init__.py +0 -1
  133. hud/adapters/claude/tests/test_adapter.py +0 -519
  134. hud/adapters/common/__init__.py +0 -6
  135. hud/adapters/common/adapter.py +0 -178
  136. hud/adapters/common/tests/test_adapter.py +0 -289
  137. hud/adapters/common/types.py +0 -446
  138. hud/adapters/operator/__init__.py +0 -5
  139. hud/adapters/operator/adapter.py +0 -108
  140. hud/adapters/operator/tests/__init__.py +0 -1
  141. hud/adapters/operator/tests/test_adapter.py +0 -370
  142. hud/agent/__init__.py +0 -19
  143. hud/agent/base.py +0 -126
  144. hud/agent/claude.py +0 -271
  145. hud/agent/langchain.py +0 -215
  146. hud/agent/misc/__init__.py +0 -3
  147. hud/agent/operator.py +0 -268
  148. hud/agent/tests/__init__.py +0 -1
  149. hud/agent/tests/test_base.py +0 -202
  150. hud/env/__init__.py +0 -11
  151. hud/env/client.py +0 -35
  152. hud/env/docker_client.py +0 -349
  153. hud/env/environment.py +0 -446
  154. hud/env/local_docker_client.py +0 -358
  155. hud/env/remote_client.py +0 -212
  156. hud/env/remote_docker_client.py +0 -292
  157. hud/gym.py +0 -130
  158. hud/job.py +0 -773
  159. hud/mcp/__init__.py +0 -17
  160. hud/mcp/base.py +0 -631
  161. hud/mcp/client.py +0 -312
  162. hud/mcp/tests/test_base.py +0 -512
  163. hud/mcp/tests/test_claude.py +0 -294
  164. hud/task.py +0 -149
  165. hud/taskset.py +0 -237
  166. hud/telemetry/_trace.py +0 -347
  167. hud/telemetry/context.py +0 -230
  168. hud/telemetry/exporter.py +0 -575
  169. hud/telemetry/instrumentation/__init__.py +0 -3
  170. hud/telemetry/instrumentation/mcp.py +0 -259
  171. hud/telemetry/instrumentation/registry.py +0 -59
  172. hud/telemetry/mcp_models.py +0 -270
  173. hud/telemetry/tests/__init__.py +0 -1
  174. hud/telemetry/tests/test_context.py +0 -210
  175. hud/telemetry/tests/test_trace.py +0 -312
  176. hud/tools/helper/README.md +0 -56
  177. hud/tools/helper/__init__.py +0 -9
  178. hud/tools/helper/mcp_server.py +0 -78
  179. hud/tools/helper/server_initialization.py +0 -115
  180. hud/tools/helper/utils.py +0 -58
  181. hud/trajectory.py +0 -94
  182. hud/utils/agent.py +0 -37
  183. hud/utils/common.py +0 -256
  184. hud/utils/config.py +0 -120
  185. hud/utils/deprecation.py +0 -115
  186. hud/utils/misc.py +0 -53
  187. hud/utils/tests/test_common.py +0 -277
  188. hud/utils/tests/test_config.py +0 -129
  189. hud_python-0.3.5.dist-info/METADATA +0 -284
  190. hud_python-0.3.5.dist-info/RECORD +0 -120
  191. /hud/{adapters/common → shared}/tests/__init__.py +0 -0
  192. {hud_python-0.3.5.dist-info → hud_python-0.4.1.dist-info}/WHEEL +0 -0
hud/datasets.py CHANGED
@@ -1,192 +1,322 @@
1
- """Dataset utilities for working with HuggingFace datasets and TaskConfigs."""
2
-
3
- from __future__ import annotations
4
-
5
- import asyncio
6
- import logging
7
- from string import Template
8
- from typing import TYPE_CHECKING, Any
9
-
10
- from mcp.types import CallToolRequestParams as MCPToolParams
11
- from pydantic import BaseModel, Field, field_validator
12
-
13
- from hud.telemetry.job import job
14
-
15
- if TYPE_CHECKING:
16
- from datasets import Dataset
17
-
18
- from hud.mcp.base import AgentResult, BaseMCPAgent
19
-
20
- logger = logging.getLogger("hud.datasets")
21
-
22
-
23
- class TaskConfig(BaseModel):
24
- """
25
- A task configuration that can be used to create a task.
26
-
27
- The mcp_config field supports environment variable substitution using
28
- template placeholders in the format ${VAR_NAME} or ${VAR_NAME:default_value}.
29
-
30
- Example:
31
- mcp_config: {
32
- "hud": {
33
- "url": "${HUD_MCP_URL:https://mcp.hud.so/v3/mcp}",
34
- "headers": {
35
- "Authorization": "Bearer ${HUD_API_KEY}",
36
- "Run-Id": "${RUN_ID}",
37
- "Mcp-Image": "your-mcp-image"
38
- }
39
- }
40
- }
41
- """
42
-
43
- id: str | None = None
44
- prompt: str
45
- mcp_config: dict[str, Any]
46
- setup_tool: MCPToolParams | None = None
47
- evaluate_tool: MCPToolParams | None = None
48
- metadata: dict[str, Any] = Field(default_factory=dict)
49
-
50
- @field_validator("mcp_config", mode="before")
51
- @classmethod
52
- def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
53
- """
54
- Automatically resolve environment variables in mcp_config using Template.
55
-
56
- Supports ${VAR_NAME} syntax with variable substitution from:
57
- 1. System environment variables (including HUD_API_KEY, etc.)
58
- 2. Runtime context variables (e.g., RUN_ID from telemetry context)
59
-
60
- Missing variables resolve to empty strings.
61
- """
62
- import os
63
-
64
- from hud.telemetry.context import get_current_task_run_id
65
-
66
- # Start with current environment variables
67
- mapping = dict(os.environ)
68
-
69
- # Add runtime context variables if available
70
- run_id = get_current_task_run_id()
71
- if run_id:
72
- mapping["RUN_ID"] = run_id
73
-
74
- def substitute_in_value(obj: Any) -> Any:
75
- """Recursively substitute variables in nested structures."""
76
- if isinstance(obj, str):
77
- # Use Template's substitute with defaultdict - missing vars become empty strings
78
- from collections import defaultdict
79
-
80
- safe_mapping = defaultdict(str, mapping)
81
- return Template(obj).substitute(safe_mapping)
82
- elif isinstance(obj, dict):
83
- return {k: substitute_in_value(v) for k, v in obj.items()}
84
- elif isinstance(obj, list):
85
- return [substitute_in_value(item) for item in obj]
86
- else:
87
- return obj
88
-
89
- return substitute_in_value(v)
90
-
91
-
92
- def to_taskconfigs(dataset: Dataset) -> Dataset:
93
- """
94
- Convert a HuggingFace dataset to contain TaskConfig objects.
95
-
96
- Args:
97
- dataset: HuggingFace dataset with task data
98
-
99
- Returns:
100
- Dataset with 'task' column containing TaskConfig objects
101
-
102
- Example:
103
- >>> dataset = load_dataset("hud/sheetbench-v1", split="test")
104
- >>> tasks = to_taskconfigs(dataset)
105
- >>> tasks[0]["task"] # This is a TaskConfig object
106
- """
107
-
108
- def _convert(example: dict[str, Any]) -> dict[str, TaskConfig]:
109
- return {"task": TaskConfig(**example)}
110
-
111
- # Map and keep only the task column
112
- return dataset.map(_convert, remove_columns=dataset.column_names)
113
-
114
-
115
- async def run_dataset(
116
- name: str,
117
- dataset: Dataset,
118
- agent_class: type[BaseMCPAgent],
119
- agent_config: dict[str, Any] | None = None,
120
- max_concurrent: int = 5,
121
- metadata: dict[str, Any] | None = None,
122
- ) -> list[Any]:
123
- """
124
- Run all tasks in a dataset with automatic job tracking.
125
-
126
- Args:
127
- name: Name for the job
128
- dataset: HuggingFace Dataset (raw, not converted)
129
- agent_class: Agent class to instantiate (e.g., ClaudeMCPAgent)
130
- agent_config: Configuration for agent (model, etc.)
131
- max_concurrent: Maximum parallel task execution
132
- metadata: Optional metadata for the job
133
-
134
- Returns:
135
- List of results from agent.run() in dataset order
136
-
137
- Example:
138
- >>> from datasets import load_dataset
139
- >>> from hud.mcp import ClaudeMCPAgent
140
- >>> dataset = load_dataset("hud/sheetbench-v1", split="test")
141
- >>> results = await run_dataset(
142
- ... "sheetbench_eval",
143
- ... dataset,
144
- ... ClaudeMCPAgent,
145
- ... {"model": "claude-3-5-sonnet-20241022"},
146
- ... max_concurrent=3,
147
- ... )
148
- """
149
- # Import here to avoid circular imports
150
- import hud
151
- from hud.mcp.client import MCPClient
152
-
153
- # Convert dataset to TaskConfigs internally
154
- tasks = to_taskconfigs(dataset)
155
-
156
- # Create job context
157
- job_metadata = metadata or {}
158
- job_metadata["agent_class"] = agent_class.__name__
159
- if agent_config:
160
- job_metadata["agent_config"] = agent_config
161
-
162
- with job(name, metadata=job_metadata):
163
- # Run tasks with semaphore for concurrency control
164
- sem = asyncio.Semaphore(max_concurrent)
165
- results: list[AgentResult | None] = [None] * len(tasks)
166
-
167
- async def _worker(index: int, row: Any) -> None:
168
- async with sem:
169
- task = row["task"]
170
-
171
- # Create trace for this task
172
- with hud.trace(f"task_{index}"):
173
- # Create fresh MCP client per task
174
- if task.mcp_config:
175
- client = MCPClient(mcp_config=task.mcp_config)
176
- agent = agent_class(mcp_client=client, **(agent_config or {}))
177
-
178
- try:
179
- results[index] = await agent.run(task)
180
- finally:
181
- await client.close()
182
- else:
183
- logger.warning("Task %d has no mcp_config defined", index)
184
- results[index] = None
185
-
186
- # Execute all tasks
187
- await asyncio.gather(
188
- *[_worker(i, row) for i, row in enumerate(tasks)],
189
- return_exceptions=True, # Don't fail entire batch on one error
190
- )
191
-
192
- return results
1
+ """Dataset utilities for working with HuggingFace datasets and Tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import logging
8
+ from string import Template
9
+ from typing import TYPE_CHECKING, Any, cast
10
+
11
+ from datasets import Dataset, load_dataset
12
+ from pydantic import BaseModel, Field, field_validator
13
+
14
+ from hud.agents.misc import ResponseAgent
15
+
16
+ from .types import MCPToolCall
17
+
18
+ if TYPE_CHECKING:
19
+ from hud.agents import MCPAgent
20
+
21
+ logger = logging.getLogger("hud.datasets")
22
+
23
+
24
class _EnvTemplate(Template):
    """``string.Template`` variant that also accepts ``${NAME:default}``.

    Only the braced form is extended; ``$NAME``, ``${NAME}`` and ``$$`` keep
    their standard ``string.Template`` meaning, and ill-formed placeholders
    still raise ``ValueError`` from ``substitute``.
    """

    # The optional ":default" tail is captured as part of the braced key and is
    # split off again by ``_EnvMapping.__missing__``. A default cannot contain "}".
    braceidpattern = r"(?a:[_a-z][_a-z0-9]*)(?::[^}]*)?"


class _EnvMapping(dict):
    """Variable lookup for ``_EnvTemplate`` that never raises ``KeyError``."""

    def __missing__(self, key: str) -> str:
        name, has_default, default = key.partition(":")
        if has_default:
            # "${NAME:default}": use the variable when it is set, else the default.
            return self.get(name, default)
        # A plain "${NAME}" / "$NAME" that is not set resolves to an empty string.
        return ""


class Task(BaseModel):
    """
    A task configuration that can be used to create a task.

    The mcp_config field supports environment variable substitution using
    template placeholders in the format ${VAR_NAME} or ${VAR_NAME:default_value}.
    The default is used only when the variable is not set in the environment;
    an unset variable without a default resolves to an empty string.

    Example:
        mcp_config: {
            "hud": {
                "url": "${HUD_MCP_URL:https://mcp.hud.so/v3/mcp}",
                "headers": {
                    "Authorization": "Bearer ${HUD_API_KEY}",
                    "Mcp-Image": "your-mcp-image"
                }
            }
        }

    The mcp_config and metadata fields may be given as JSON strings;
    setup_tool and evaluate_tool may be given as a dict, a list of dicts,
    or a JSON string encoding either.
    """

    id: str | None = None
    prompt: str
    mcp_config: dict[str, Any]
    setup_tool: MCPToolCall | list[MCPToolCall] | None = None
    evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
    system_prompt: str | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)

    @field_validator("mcp_config", "metadata", mode="before")
    @classmethod
    def parse_json_strings(cls, v: Any) -> Any:
        """Parse a JSON string into its decoded value; pass other inputs through.

        Raises:
            ValueError: If ``v`` is a string that is not valid JSON.
        """
        if not isinstance(v, str):
            return v
        try:
            return json.loads(v)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON string: {e}") from e

    @field_validator("setup_tool", "evaluate_tool", mode="before")
    @classmethod
    def convert_dict_to_tool_call(cls, v: Any) -> Any:
        """Convert a dict (or list of dicts) to MCPToolCall, parsing JSON strings first.

        Raises:
            ValueError: If ``v`` is a string that is not valid JSON.
        """
        if v is None:
            return None

        # Parse JSON string if needed
        if isinstance(v, str):
            try:
                v = json.loads(v)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON string: {e}") from e

        if isinstance(v, dict):
            return MCPToolCall(**v)
        if isinstance(v, list):
            # Non-dict items are passed through unchanged.
            return [MCPToolCall(**item) if isinstance(item, dict) else item for item in v]
        return v

    @field_validator("mcp_config", mode="before")
    @classmethod
    def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
        """
        Resolve environment variables in mcp_config.

        Supports ``$VAR_NAME``, ``${VAR_NAME}`` and ``${VAR_NAME:default_value}``
        with values taken from the process environment (HUD_API_KEY, etc.).
        Substitution is applied recursively to string values inside dicts and
        lists; dict keys and non-string values are left untouched. A raw string
        input is substituted as a whole, so this validator does not depend on
        whether JSON parsing has already happened.

        Missing variables resolve to their default, or to an empty string when
        no default is given.

        Raises:
            ValueError: If a string contains an ill-formed ``$`` placeholder.
        """
        import os

        # Snapshot the environment once; every string shares the same lookup table.
        env = _EnvMapping(os.environ)

        def substitute_in_value(obj: Any) -> Any:
            """Recursively substitute variables in nested structures."""
            if isinstance(obj, str):
                return _EnvTemplate(obj).substitute(env)
            if isinstance(obj, dict):
                return {key: substitute_in_value(val) for key, val in obj.items()}
            if isinstance(obj, list):
                return [substitute_in_value(item) for item in obj]
            return obj

        return substitute_in_value(v)
114
+
115
+
116
async def fetch_system_prompt_from_dataset(dataset_id: str) -> str | None:
    """
    Fetch system_prompt.txt from a HuggingFace dataset repository.

    The download and the file read are blocking calls, so they run in a worker
    thread to keep the event loop responsive.

    Args:
        dataset_id: HuggingFace dataset identifier (e.g., "hud-evals/SheetBench-50")

    Returns:
        The stripped system prompt text if the file exists and is non-empty,
        None otherwise. Failures are logged and also reported as None.
    """
    try:
        # Import here to avoid unnecessary dependency
        from huggingface_hub import hf_hub_download
        from huggingface_hub.errors import EntryNotFoundError
    except ImportError:
        logger.warning(
            "huggingface_hub not installed. Install it to fetch system prompts from datasets."
        )
        return None

    def _download_and_read() -> str:
        """Download system_prompt.txt and return its stripped content."""
        file_path = hf_hub_download(
            repo_id=dataset_id, filename="system_prompt.txt", repo_type="dataset"
        )
        with open(file_path, encoding="utf-8") as f:
            return f.read().strip()

    try:
        content = await asyncio.to_thread(_download_and_read)
    except EntryNotFoundError:
        logger.debug("No system_prompt.txt found in dataset %s", dataset_id)
        return None
    except Exception as e:
        # Best effort: a missing prompt must not abort the caller.
        logger.error("Error fetching system prompt from %s: %s", dataset_id, e)
        return None

    if not content:
        logger.warning("System prompt file is empty in %s", dataset_id)
        return None

    logger.info("Loaded system prompt from %s (length: %d chars)", dataset_id, len(content))
    return content
161
+
162
+
163
async def run_dataset(
    name: str,
    dataset: str | Dataset | list[dict[str, Any]],
    agent_class: type[MCPAgent],
    agent_config: dict[str, Any] | None = None,
    max_concurrent: int = 50,
    metadata: dict[str, Any] | None = None,
    max_steps: int = 40,
    split: str = "train",
    auto_respond: bool = False,
    custom_system_prompt: str | None = None,
) -> list[Any]:
    """
    Run all tasks in a dataset with automatic job tracking.

    Args:
        name: Name for the job
        dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
            Dataset object, OR list of task dictionaries (not Task objects)
        agent_class: Agent class to instantiate (e.g., ClaudeAgent)
        agent_config: Configuration/kwargs for agent (model, etc.)
        max_concurrent: Maximum parallel task execution
        metadata: Optional metadata for the job (copied, never modified)
        max_steps: Maximum steps per task
        split: Dataset split to use when loading from string (default: "train")
        auto_respond: Whether to use auto-response agent
        custom_system_prompt: System prompt applied to every task that has no
            "system_prompt" key of its own

    Returns:
        List of results from agent.run() in dataset order. The entry for a
        task that raised is None; the failure is logged.

    Example:
        >>> from hud.agents import ClaudeAgent
        >>> # Option 1: From dataset string identifier
        >>> results = await run_dataset(
        ...     "SheetBench Eval",
        ...     "hud-evals/SheetBench-50",
        ...     ClaudeAgent,
        ...     {"model": "claude-3-5-sonnet-20241022"},
        ... )
        >>> # Option 2: From HuggingFace dataset object
        >>> from datasets import load_dataset
        >>> dataset = load_dataset("hud-evals/SheetBench-50", split="train")
        >>> results = await run_dataset("my_eval", dataset, ClaudeAgent)
        >>> # Option 3: From list of dicts
        >>> tasks = [{"prompt": "...", "mcp_config": {...}, ...}, ...]
        >>> results = await run_dataset("browser_eval", tasks, ClaudeAgent)
    """
    # Import here to avoid circular imports
    import hud

    dataset_link = None

    # Load dataset from string if needed
    if isinstance(dataset, str):
        logger.info("Loading dataset %s from HuggingFace...", dataset)
        dataset_link = dataset

        # Load dataset from HuggingFace
        dataset = cast("Dataset", load_dataset(dataset, split=split))

    # Create job context on a copy so the caller's metadata dict is not modified
    job_metadata = dict(metadata or {})
    job_metadata["agent_class"] = agent_class.__name__
    job_metadata["agent_config"] = agent_config

    # Extract dataset verification info if available
    if isinstance(dataset, Dataset) and not dataset_link:
        # The first checksum key is expected to look like
        # "<scheme>//datasets/<project>/<name>@<revision>/..."; datasets built in
        # memory carry no checksums, in which case no link is recorded.
        checksums = getattr(dataset.info, "download_checksums", None) or {}
        try:
            general_info = next(iter(checksums)).split("/")
            project = general_info[3]
            dataset_name = general_info[4].split("@")[0]
            dataset_link = f"{project}/{dataset_name}"
        except (StopIteration, IndexError, AttributeError, TypeError):
            logger.debug("Could not infer dataset link from download checksums")

    with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
        # Run tasks with semaphore for concurrency control
        sem = asyncio.Semaphore(max_concurrent)
        results: list[Any | None] = [None] * len(dataset)

        async def _worker(index: int, task_dict: Any, max_steps: int = 40) -> None:
            async with sem:
                task_name = task_dict.get("prompt") or f"Task {index}"
                if "system_prompt" not in task_dict:
                    # Work on a copy so the caller's task dictionaries stay untouched
                    task_dict = {**task_dict, "system_prompt": custom_system_prompt}
                # Create trace for this task
                with hud.trace(task_name, job_id=job_obj.id, task_id=task_dict.get("id")):
                    # Convert dict to Task here, at trace level
                    task = Task(**task_dict)

                    agent = agent_class(**(agent_config or {}))

                    if auto_respond:
                        agent.response_agent = ResponseAgent()
                    results[index] = await agent.run(task, max_steps=max_steps)

        # Execute all tasks
        outcomes = await asyncio.gather(
            *[_worker(i, task, max_steps=max_steps) for i, task in enumerate(dataset)],
            return_exceptions=True,  # Don't fail entire batch on one error
        )

        # Surface failures instead of dropping them; the result slot stays None
        for index, outcome in enumerate(outcomes):
            if isinstance(outcome, BaseException):
                logger.warning(
                    "Task %d failed with %s: %s", index, type(outcome).__name__, outcome
                )

    return results
263
+
264
+
265
def save_tasks(
    tasks: list[dict[str, Any]], repo_id: str, fields: list[str] | None = None, **kwargs: Any
) -> None:
    """
    Push a list of task dictionaries to a HuggingFace dataset repository.

    Dict and list values are stored as JSON strings, None becomes an empty
    string, str/int/float/bool values are stored as-is, and any other value is
    stored as its str() form. This keeps the dataset schema flat.

    Args:
        tasks: List of dictionaries to save
        repo_id: HuggingFace repository ID (e.g., "hud-evals/my-tasks")
        fields: Optional list of fields to save. If None, saves all fields from each dict.
            Requested fields that a dictionary lacks are omitted from its row.
        **kwargs: Additional arguments passed to dataset.push_to_hub()

    Raises:
        ValueError: If any item is a Task object rather than a dictionary.
    """
    from datasets import Dataset

    def _to_cell(value: Any) -> Any:
        """Map one field value onto its stored representation."""
        if isinstance(value, (dict, list)):
            return json.dumps(value)
        if value is None:
            return ""
        if isinstance(value, (str, int, float, bool)):
            return value
        return str(value)

    # Task objects carry resolved environment variables; refuse them up front.
    if tasks and isinstance(tasks[0], Task):
        raise ValueError(
            "save_tasks expects dictionaries, not Task objects. "
            "Task objects have resolved environment variables which would expose secrets. "
            "Please pass raw dictionaries with template strings like '${HUD_API_KEY}' preserved."
        )

    rows = []
    for position, entry in enumerate(tasks):
        # Same guard for every later item
        if isinstance(entry, Task):
            raise ValueError(
                f"Item {position} is a Task object, not a dictionary. "
                "This would expose resolved environment variables. "
                "Please convert to dictionary format with template strings preserved."
            )

        wanted = list(entry.keys()) if fields is None else fields
        rows.append({key: _to_cell(entry[key]) for key in wanted if key in entry})

    # Create and push dataset
    Dataset.from_list(rows).push_to_hub(repo_id, **kwargs)
hud/misc/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Miscellaneous utilities for HUD SDK."""