hud-python 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only.

Potentially problematic release.


This version of hud-python might be problematic.

Files changed (130)
  1. hud/__init__.py +22 -22
  2. hud/agents/__init__.py +13 -15
  3. hud/agents/base.py +599 -599
  4. hud/agents/claude.py +373 -373
  5. hud/agents/langchain.py +261 -250
  6. hud/agents/misc/__init__.py +7 -7
  7. hud/agents/misc/response_agent.py +82 -80
  8. hud/agents/openai.py +352 -352
  9. hud/agents/openai_chat_generic.py +154 -154
  10. hud/agents/tests/__init__.py +1 -1
  11. hud/agents/tests/test_base.py +742 -742
  12. hud/agents/tests/test_claude.py +324 -324
  13. hud/agents/tests/test_client.py +363 -363
  14. hud/agents/tests/test_openai.py +237 -237
  15. hud/cli/__init__.py +617 -617
  16. hud/cli/__main__.py +8 -8
  17. hud/cli/analyze.py +371 -371
  18. hud/cli/analyze_metadata.py +230 -230
  19. hud/cli/build.py +498 -427
  20. hud/cli/clone.py +185 -185
  21. hud/cli/cursor.py +92 -92
  22. hud/cli/debug.py +392 -392
  23. hud/cli/docker_utils.py +83 -83
  24. hud/cli/init.py +280 -281
  25. hud/cli/interactive.py +353 -353
  26. hud/cli/mcp_server.py +764 -756
  27. hud/cli/pull.py +330 -336
  28. hud/cli/push.py +404 -370
  29. hud/cli/remote_runner.py +311 -311
  30. hud/cli/runner.py +160 -160
  31. hud/cli/tests/__init__.py +3 -3
  32. hud/cli/tests/test_analyze.py +284 -284
  33. hud/cli/tests/test_cli_init.py +265 -265
  34. hud/cli/tests/test_cli_main.py +27 -27
  35. hud/cli/tests/test_clone.py +142 -142
  36. hud/cli/tests/test_cursor.py +253 -253
  37. hud/cli/tests/test_debug.py +453 -453
  38. hud/cli/tests/test_mcp_server.py +139 -139
  39. hud/cli/tests/test_utils.py +388 -388
  40. hud/cli/utils.py +263 -263
  41. hud/clients/README.md +143 -143
  42. hud/clients/__init__.py +16 -16
  43. hud/clients/base.py +378 -379
  44. hud/clients/fastmcp.py +222 -222
  45. hud/clients/mcp_use.py +298 -278
  46. hud/clients/tests/__init__.py +1 -1
  47. hud/clients/tests/test_client_integration.py +111 -111
  48. hud/clients/tests/test_fastmcp.py +342 -342
  49. hud/clients/tests/test_protocol.py +188 -188
  50. hud/clients/utils/__init__.py +1 -1
  51. hud/clients/utils/retry_transport.py +160 -160
  52. hud/datasets.py +327 -322
  53. hud/misc/__init__.py +1 -1
  54. hud/misc/claude_plays_pokemon.py +292 -292
  55. hud/otel/__init__.py +35 -35
  56. hud/otel/collector.py +142 -142
  57. hud/otel/config.py +164 -164
  58. hud/otel/context.py +536 -536
  59. hud/otel/exporters.py +366 -366
  60. hud/otel/instrumentation.py +97 -97
  61. hud/otel/processors.py +118 -118
  62. hud/otel/tests/__init__.py +1 -1
  63. hud/otel/tests/test_processors.py +197 -197
  64. hud/server/__init__.py +5 -5
  65. hud/server/context.py +114 -114
  66. hud/server/helper/__init__.py +5 -5
  67. hud/server/low_level.py +132 -132
  68. hud/server/server.py +170 -166
  69. hud/server/tests/__init__.py +3 -3
  70. hud/settings.py +73 -73
  71. hud/shared/__init__.py +5 -5
  72. hud/shared/exceptions.py +180 -180
  73. hud/shared/requests.py +264 -264
  74. hud/shared/tests/test_exceptions.py +157 -157
  75. hud/shared/tests/test_requests.py +275 -275
  76. hud/telemetry/__init__.py +25 -25
  77. hud/telemetry/instrument.py +379 -379
  78. hud/telemetry/job.py +309 -309
  79. hud/telemetry/replay.py +74 -74
  80. hud/telemetry/trace.py +83 -83
  81. hud/tools/__init__.py +33 -33
  82. hud/tools/base.py +365 -365
  83. hud/tools/bash.py +161 -161
  84. hud/tools/computer/__init__.py +15 -15
  85. hud/tools/computer/anthropic.py +437 -437
  86. hud/tools/computer/hud.py +376 -376
  87. hud/tools/computer/openai.py +295 -295
  88. hud/tools/computer/settings.py +82 -82
  89. hud/tools/edit.py +314 -314
  90. hud/tools/executors/__init__.py +30 -30
  91. hud/tools/executors/base.py +539 -539
  92. hud/tools/executors/pyautogui.py +621 -621
  93. hud/tools/executors/tests/__init__.py +1 -1
  94. hud/tools/executors/tests/test_base_executor.py +338 -338
  95. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  96. hud/tools/executors/xdo.py +511 -511
  97. hud/tools/playwright.py +412 -412
  98. hud/tools/tests/__init__.py +3 -3
  99. hud/tools/tests/test_base.py +282 -282
  100. hud/tools/tests/test_bash.py +158 -158
  101. hud/tools/tests/test_bash_extended.py +197 -197
  102. hud/tools/tests/test_computer.py +425 -425
  103. hud/tools/tests/test_computer_actions.py +34 -34
  104. hud/tools/tests/test_edit.py +259 -259
  105. hud/tools/tests/test_init.py +27 -27
  106. hud/tools/tests/test_playwright_tool.py +183 -183
  107. hud/tools/tests/test_tools.py +145 -145
  108. hud/tools/tests/test_utils.py +156 -156
  109. hud/tools/types.py +72 -72
  110. hud/tools/utils.py +50 -50
  111. hud/types.py +136 -136
  112. hud/utils/__init__.py +10 -10
  113. hud/utils/async_utils.py +65 -65
  114. hud/utils/design.py +236 -168
  115. hud/utils/mcp.py +55 -55
  116. hud/utils/progress.py +149 -149
  117. hud/utils/telemetry.py +66 -66
  118. hud/utils/tests/test_async_utils.py +173 -173
  119. hud/utils/tests/test_init.py +17 -17
  120. hud/utils/tests/test_progress.py +261 -261
  121. hud/utils/tests/test_telemetry.py +82 -82
  122. hud/utils/tests/test_version.py +8 -8
  123. hud/version.py +7 -7
  124. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/METADATA +10 -8
  125. hud_python-0.4.3.dist-info/RECORD +131 -0
  126. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/licenses/LICENSE +21 -21
  127. hud/agents/art.py +0 -101
  128. hud_python-0.4.1.dist-info/RECORD +0 -132
  129. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/WHEEL +0 -0
  130. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/entry_points.txt +0 -0
hud/datasets.py CHANGED
@@ -1,322 +1,327 @@
- """Dataset utilities for working with HuggingFace datasets and Tasks."""
-
- from __future__ import annotations
-
- import asyncio
- import json
- import logging
- from string import Template
- from typing import TYPE_CHECKING, Any, cast
-
- from datasets import Dataset, load_dataset
- from pydantic import BaseModel, Field, field_validator
-
- from hud.agents.misc import ResponseAgent
-
- from .types import MCPToolCall
-
- if TYPE_CHECKING:
-     from hud.agents import MCPAgent
-
- logger = logging.getLogger("hud.datasets")
-
-
- class Task(BaseModel):
-     """
-     A task configuration that can be used to create a task.
-
-     The mcp_config field supports environment variable substitution using
-     template placeholders in the format ${VAR_NAME} or ${VAR_NAME:default_value}.
-
-     Example:
-         mcp_config: {
-             "hud": {
-                 "url": "${HUD_MCP_URL:https://mcp.hud.so/v3/mcp}",
-                 "headers": {
-                     "Authorization": "Bearer ${HUD_API_KEY}",
-                     "Mcp-Image": "your-mcp-image"
-                 }
-             }
-         }
-     """
-
-     id: str | None = None
-     prompt: str
-     mcp_config: dict[str, Any]
-     setup_tool: MCPToolCall | list[MCPToolCall] | None = None
-     evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
-     system_prompt: str | None = None
-     metadata: dict[str, Any] = Field(default_factory=dict)
-
-     @field_validator("mcp_config", "metadata", mode="before")
-     @classmethod
-     def parse_json_strings(cls, v: Any) -> Any:
-         """Parse JSON strings into dictionaries."""
-         if isinstance(v, str):
-             try:
-                 return json.loads(v)
-             except json.JSONDecodeError as e:
-                 raise ValueError(f"Invalid JSON string: {e}") from e
-         return v
-
-     @field_validator("setup_tool", "evaluate_tool", mode="before")
-     @classmethod
-     def convert_dict_to_tool_call(cls, v: Any) -> Any:
-         """Convert dict to MCPToolCall instance, parsing JSON strings first."""
-         if v is None:
-             return None
-
-         # Parse JSON string if needed
-         if isinstance(v, str):
-             try:
-                 v = json.loads(v)
-             except json.JSONDecodeError as e:
-                 raise ValueError(f"Invalid JSON string: {e}") from e
-
-         if isinstance(v, dict):
-             return MCPToolCall(**v)
-         if isinstance(v, list):
-             return [MCPToolCall(**item) if isinstance(item, dict) else item for item in v]
-         return v
-
-     @field_validator("mcp_config", mode="before")
-     @classmethod
-     def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
-         """
-         Automatically resolve environment variables in mcp_config using Template.
-
-         Supports ${VAR_NAME} syntax with variable substitution from
-         System environment variables (including HUD_API_KEY, etc.)
-
-         Missing variables resolve to empty strings.
-         """
-         import os
-
-         # Start with current environment variables
-         mapping = dict(os.environ)
-
-         def substitute_in_value(obj: Any) -> Any:
-             """Recursively substitute variables in nested structures."""
-             if isinstance(obj, str):
-                 # Use Template's substitute with defaultdict - missing vars become empty strings
-                 from collections import defaultdict
-
-                 safe_mapping = defaultdict(str, mapping)
-                 return Template(obj).substitute(safe_mapping)
-             elif isinstance(obj, dict):
-                 return {k: substitute_in_value(v) for k, v in obj.items()}
-             elif isinstance(obj, list):
-                 return [substitute_in_value(item) for item in obj]
-             else:
-                 return obj
-
-         return substitute_in_value(v)
-
-
- async def fetch_system_prompt_from_dataset(dataset_id: str) -> str | None:
-     """
-     Fetch system_prompt.txt from a HuggingFace dataset repository.
-
-     Args:
-         dataset_id: HuggingFace dataset identifier (e.g., "hud-evals/SheetBench-50")
-
-     Returns:
-         System prompt text if found, None otherwise
-     """
-     try:
-         # Import here to avoid unnecessary dependency
-         from huggingface_hub import hf_hub_download
-         from huggingface_hub.errors import EntryNotFoundError
-
-         # Try to download the system_prompt.txt file
-         try:
-             file_path = hf_hub_download(
-                 repo_id=dataset_id, filename="system_prompt.txt", repo_type="dataset"
-             )
-
-             # Read and return the content
-             with open(file_path, encoding="utf-8") as f:  # noqa: ASYNC230
-                 content = f.read().strip()
-                 if content:
-                     logger.info(
-                         "Loaded system prompt from %s (length: %d chars)", dataset_id, len(content)
-                     )
-                     return content
-                 else:
-                     logger.warning("System prompt file is empty in %s", dataset_id)
-                     return None
-
-         except EntryNotFoundError:
-             logger.debug("No system_prompt.txt found in dataset %s", dataset_id)
-             return None
-
-     except ImportError:
-         logger.warning(
-             "huggingface_hub not installed. Install it to fetch system prompts from datasets."
-         )
-         return None
-     except Exception as e:
-         logger.error("Error fetching system prompt from %s: %s", dataset_id, e)
-         return None
-
-
- async def run_dataset(
-     name: str,
-     dataset: str | Dataset | list[dict[str, Any]],
-     agent_class: type[MCPAgent],
-     agent_config: dict[str, Any] | None = None,
-     max_concurrent: int = 50,
-     metadata: dict[str, Any] | None = None,
-     max_steps: int = 40,
-     split: str = "train",
-     auto_respond: bool = False,
-     custom_system_prompt: str | None = None,
- ) -> list[Any]:
-     """
-     Run all tasks in a dataset with automatic job tracking.
-
-     Args:
-         name: Name for the job
-         dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
-             Dataset object, OR list of Task objects
-         agent_class: Agent class to instantiate (e.g., ClaudeAgent)
-         agent_config: Configuration/kwargs for agent (model, etc.)
-         max_concurrent: Maximum parallel task execution
-         metadata: Optional metadata for the job
-         max_steps: Maximum steps per task
-         split: Dataset split to use when loading from string (default: "train")
-         auto_respond: Whether to use auto-response agent
-
-     Returns:
-         List of results from agent.run() in dataset order
-
-     Example:
-         >>> from hud.agents import ClaudeAgent
-         >>> # Option 1: From dataset string identifier
-         >>> results = await run_dataset(
-         ...     "SheetBench Eval",
-         ...     "hud-evals/SheetBench-50",
-         ...     ClaudeAgent,
-         ...     {"model": "claude-3-5-sonnet-20241022"},
-         ... )
-         >>> # Option 2: From HuggingFace dataset object
-         >>> from datasets import load_dataset
-         >>> dataset = load_dataset("hud-evals/SheetBench-50", split="train")
-         >>> results = await run_dataset("my_eval", dataset, ClaudeAgent)
-         >>> # Option 3: From list of dicts
-         >>> tasks = [{"prompt": "...", "mcp_config": {...}, ...}, ...]
-         >>> results = await run_dataset("browser_eval", tasks, ClaudeAgent)
-     """
-     # Import here to avoid circular imports
-     import hud
-
-     dataset_link = None
-
-     # Load dataset from string if needed
-     if isinstance(dataset, str):
-         logger.info("Loading dataset %s from HuggingFace...", dataset)
-         dataset_link = dataset
-
-         # Load dataset from HuggingFace
-         dataset = cast("Dataset", load_dataset(dataset, split=split))
-
-     # Create job context
-     job_metadata = metadata or {}
-     job_metadata["agent_class"] = agent_class.__name__
-     job_metadata["agent_config"] = agent_config
-
-     # Extract dataset verification info if available
-     if isinstance(dataset, Dataset) and not dataset_link:
-         general_info = next(iter(dataset.info.__dict__["download_checksums"].keys())).split("/")
-         project = general_info[3]
-         dataset_name = general_info[4].split("@")[0]
-         dataset_link = f"{project}/{dataset_name}"
-
-     with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
-         # Run tasks with semaphore for concurrency control
-         sem = asyncio.Semaphore(max_concurrent)
-         results: list[Any | None] = [None] * len(dataset)
-
-         async def _worker(index: int, task_dict: Any, max_steps: int = 40) -> None:
-             async with sem:
-                 # Create trace for this task
-                 task_name = task_dict.get("prompt") or f"Task {index}"
-                 if "system_prompt" not in task_dict:
-                     task_dict["system_prompt"] = custom_system_prompt
-                 with hud.trace(task_name, job_id=job_obj.id, task_id=task_dict.get("id")):
-                     # Convert dict to Task here, at trace level
-                     task = Task(**task_dict)
-
-                     agent = agent_class(**(agent_config or {}))
-
-                     if auto_respond:
-                         agent.response_agent = ResponseAgent()
-                     results[index] = await agent.run(task, max_steps=max_steps)
-
-         # Execute all tasks
-         await asyncio.gather(
-             *[_worker(i, task, max_steps=max_steps) for i, task in enumerate(dataset)],
-             return_exceptions=True,  # Don't fail entire batch on one error
-         )
-
-     return results
-
-
- def save_tasks(
-     tasks: list[dict[str, Any]], repo_id: str, fields: list[str] | None = None, **kwargs: Any
- ) -> None:
-     """
-     Save data to HuggingFace dataset with JSON string serialization.
-
-     Complex fields (dicts, lists) are serialized as JSON strings to maintain clean schema
-     and avoid null value pollution in HuggingFace datasets.
-
-     Args:
-         tasks: List of dictionaries to save
-         repo_id: HuggingFace repository ID (e.g., "hud-evals/my-tasks")
-         fields: Optional list of fields to save. If None, saves all fields from each dict.
-         **kwargs: Additional arguments passed to dataset.push_to_hub()
-     """
-     from datasets import Dataset
-
-     # Safety check: Ensure we're not saving Task objects (which have resolved env vars)
-     if tasks and isinstance(tasks[0], Task):
-         raise ValueError(
-             "save_tasks expects dictionaries, not Task objects. "
-             "Task objects have resolved environment variables which would expose secrets. "
-             "Please pass raw dictionaries with template strings like '${HUD_API_KEY}' preserved."
-         )
-
-     # Convert to rows with JSON string fields
-     data = []
-     for i, tc_dict in enumerate(tasks):
-         # Additional safety check for each item
-         if isinstance(tc_dict, Task):
-             raise ValueError(
-                 f"Item {i} is a Task object, not a dictionary. "
-                 "This would expose resolved environment variables. "
-                 "Please convert to dictionary format with template strings preserved."
-             )
-
-         row = {}
-
-         # Determine which fields to process
-         fields_to_process = fields if fields is not None else list(tc_dict.keys())
-
-         for field in fields_to_process:
-             if field in tc_dict:
-                 value = tc_dict[field]
-                 # Serialize complex types as JSON strings
-                 if isinstance(value, (dict | list)):
-                     row[field] = json.dumps(value)
-                 elif isinstance(value, (str | int | float | bool | type(None))):
-                     row[field] = value if value is not None else ""
-                 else:
-                     # For other types, convert to string
-                     row[field] = str(value)
-
-         data.append(row)
-
-     # Create and push dataset
-     dataset = Dataset.from_list(data)
-     dataset.push_to_hub(repo_id, **kwargs)
+ """Dataset utilities for working with HuggingFace datasets and Tasks."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import logging
+ from string import Template
+ from typing import TYPE_CHECKING, Any, cast
+
+ from datasets import Dataset, load_dataset
+ from pydantic import BaseModel, Field, field_validator
+
+ from hud.agents.misc import ResponseAgent
+ from hud.settings import settings
+
+ from .types import MCPToolCall
+
+ if TYPE_CHECKING:
+     from hud.agents import MCPAgent
+
+ logger = logging.getLogger("hud.datasets")
+
+
+ class Task(BaseModel):
+     """
+     A task configuration that can be used to create a task.
+
+     The mcp_config field supports environment variable substitution using
+     template placeholders in the format ${VAR_NAME} or ${VAR_NAME:default_value}.
+
+     Example:
+         mcp_config: {
+             "hud": {
+                 "url": "${HUD_MCP_URL:https://mcp.hud.so/v3/mcp}",
+                 "headers": {
+                     "Authorization": "Bearer ${HUD_API_KEY}",
+                     "Mcp-Image": "your-mcp-image"
+                 }
+             }
+         }
+     """
+
+     id: str | None = None
+     prompt: str
+     mcp_config: dict[str, Any]
+     setup_tool: MCPToolCall | list[MCPToolCall] | None = None
+     evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
+     system_prompt: str | None = None
+     metadata: dict[str, Any] = Field(default_factory=dict)
+
+     @field_validator("mcp_config", "metadata", mode="before")
+     @classmethod
+     def parse_json_strings(cls, v: Any) -> Any:
+         """Parse JSON strings into dictionaries."""
+         if isinstance(v, str):
+             try:
+                 return json.loads(v)
+             except json.JSONDecodeError as e:
+                 raise ValueError(f"Invalid JSON string: {e}") from e
+         return v
+
+     @field_validator("setup_tool", "evaluate_tool", mode="before")
+     @classmethod
+     def convert_dict_to_tool_call(cls, v: Any) -> Any:
+         """Convert dict to MCPToolCall instance, parsing JSON strings first."""
+         if v is None:
+             return None
+
+         # Parse JSON string if needed
+         if isinstance(v, str):
+             try:
+                 v = json.loads(v)
+             except json.JSONDecodeError as e:
+                 raise ValueError(f"Invalid JSON string: {e}") from e
+
+         if isinstance(v, dict):
+             return MCPToolCall(**v)
+         if isinstance(v, list):
+             return [MCPToolCall(**item) if isinstance(item, dict) else item for item in v]
+         return v
+
+     @field_validator("mcp_config", mode="before")
+     @classmethod
+     def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
+         """
+         Automatically resolve environment variables in mcp_config using Template.
+
+         Supports ${VAR_NAME} syntax with variable substitution from
+         System environment variables (including HUD_API_KEY, etc.)
+
+         Missing variables resolve to empty strings.
+         """
+         import os
+
+         # Start with current environment variables
+         mapping = dict(os.environ)
+         mapping.update(settings.model_dump())
+
+         if settings.api_key:
+             mapping["HUD_API_KEY"] = settings.api_key
+
+         def substitute_in_value(obj: Any) -> Any:
+             """Recursively substitute variables in nested structures."""
+             if isinstance(obj, str):
+                 # Use Template's substitute with defaultdict - missing vars become empty strings
+                 from collections import defaultdict
+
+                 safe_mapping = defaultdict(str, mapping)
+                 return Template(obj).substitute(safe_mapping)
+             elif isinstance(obj, dict):
+                 return {k: substitute_in_value(v) for k, v in obj.items()}
+             elif isinstance(obj, list):
+                 return [substitute_in_value(item) for item in obj]
+             else:
+                 return obj
+
+         return substitute_in_value(v)
+
+
+ async def fetch_system_prompt_from_dataset(dataset_id: str) -> str | None:
+     """
+     Fetch system_prompt.txt from a HuggingFace dataset repository.
+
+     Args:
+         dataset_id: HuggingFace dataset identifier (e.g., "hud-evals/SheetBench-50")
+
+     Returns:
+         System prompt text if found, None otherwise
+     """
+     try:
+         # Import here to avoid unnecessary dependency
+         from huggingface_hub import hf_hub_download
+         from huggingface_hub.errors import EntryNotFoundError
+
+         # Try to download the system_prompt.txt file
+         try:
+             file_path = hf_hub_download(
+                 repo_id=dataset_id, filename="system_prompt.txt", repo_type="dataset"
+             )
+
+             # Read and return the content
+             with open(file_path, encoding="utf-8") as f:  # noqa: ASYNC230
+                 content = f.read().strip()
+                 if content:
+                     logger.info(
+                         "Loaded system prompt from %s (length: %d chars)", dataset_id, len(content)
+                     )
+                     return content
+                 else:
+                     logger.warning("System prompt file is empty in %s", dataset_id)
+                     return None
+
+         except EntryNotFoundError:
+             logger.debug("No system_prompt.txt found in dataset %s", dataset_id)
+             return None
+
+     except ImportError:
+         logger.warning(
+             "huggingface_hub not installed. Install it to fetch system prompts from datasets."
+         )
+         return None
+     except Exception as e:
+         logger.error("Error fetching system prompt from %s: %s", dataset_id, e)
+         return None
+
+
+ async def run_dataset(
+     name: str,
+     dataset: str | Dataset | list[dict[str, Any]],
+     agent_class: type[MCPAgent],
+     agent_config: dict[str, Any] | None = None,
+     max_concurrent: int = 50,
+     metadata: dict[str, Any] | None = None,
+     max_steps: int = 40,
+     split: str = "train",
+     auto_respond: bool = False,
+     custom_system_prompt: str | None = None,
+ ) -> list[Any]:
+     """
+     Run all tasks in a dataset with automatic job tracking.
+
+     Args:
+         name: Name for the job
+         dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
+             Dataset object, OR list of Task objects
+         agent_class: Agent class to instantiate (e.g., ClaudeAgent)
+         agent_config: Configuration/kwargs for agent (model, etc.)
+         max_concurrent: Maximum parallel task execution
+         metadata: Optional metadata for the job
+         max_steps: Maximum steps per task
+         split: Dataset split to use when loading from string (default: "train")
+         auto_respond: Whether to use auto-response agent
+
+     Returns:
+         List of results from agent.run() in dataset order
+
+     Example:
+         >>> from hud.agents import ClaudeAgent
+         >>> # Option 1: From dataset string identifier
+         >>> results = await run_dataset(
+         ...     "SheetBench Eval",
+         ...     "hud-evals/SheetBench-50",
+         ...     ClaudeAgent,
+         ...     {"model": "claude-3-5-sonnet-20241022"},
+         ... )
+         >>> # Option 2: From HuggingFace dataset object
+         >>> from datasets import load_dataset
+         >>> dataset = load_dataset("hud-evals/SheetBench-50", split="train")
+         >>> results = await run_dataset("my_eval", dataset, ClaudeAgent)
+         >>> # Option 3: From list of dicts
+         >>> tasks = [{"prompt": "...", "mcp_config": {...}, ...}, ...]
+         >>> results = await run_dataset("browser_eval", tasks, ClaudeAgent)
+     """
+     # Import here to avoid circular imports
+     import hud
+
+     dataset_link = None
+
+     # Load dataset from string if needed
+     if isinstance(dataset, str):
+         logger.info("Loading dataset %s from HuggingFace...", dataset)
+         dataset_link = dataset
+
+         # Load dataset from HuggingFace
+         dataset = cast("Dataset", load_dataset(dataset, split=split))
+
+     # Create job context
+     job_metadata = metadata or {}
+     job_metadata["agent_class"] = agent_class.__name__
+     job_metadata["agent_config"] = agent_config
+
+     # Extract dataset verification info if available
+     if isinstance(dataset, Dataset) and not dataset_link:
+         general_info = next(iter(dataset.info.__dict__["download_checksums"].keys())).split("/")
+         project = general_info[3]
+         dataset_name = general_info[4].split("@")[0]
+         dataset_link = f"{project}/{dataset_name}"
+
+     with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
+         # Run tasks with semaphore for concurrency control
+         sem = asyncio.Semaphore(max_concurrent)
+         results: list[Any | None] = [None] * len(dataset)
+
+         async def _worker(index: int, task_dict: Any, max_steps: int = 40) -> None:
+             async with sem:
+                 # Create trace for this task
+                 task_name = task_dict.get("prompt") or f"Task {index}"
+                 if "system_prompt" not in task_dict:
+                     task_dict["system_prompt"] = custom_system_prompt
+                 with hud.trace(task_name, job_id=job_obj.id, task_id=task_dict.get("id")):
+                     # Convert dict to Task here, at trace level
+                     task = Task(**task_dict)
+
+                     agent = agent_class(**(agent_config or {}))
+
+                     if auto_respond:
+                         agent.response_agent = ResponseAgent()
+                     results[index] = await agent.run(task, max_steps=max_steps)
+
+         # Execute all tasks
+         await asyncio.gather(
+             *[_worker(i, task, max_steps=max_steps) for i, task in enumerate(dataset)],
+             return_exceptions=True,  # Don't fail entire batch on one error
+         )
+
+     return results
+
+
+ def save_tasks(
+     tasks: list[dict[str, Any]], repo_id: str, fields: list[str] | None = None, **kwargs: Any
+ ) -> None:
+     """
+     Save data to HuggingFace dataset with JSON string serialization.
+
+     Complex fields (dicts, lists) are serialized as JSON strings to maintain clean schema
+     and avoid null value pollution in HuggingFace datasets.
+
+     Args:
+         tasks: List of dictionaries to save
+         repo_id: HuggingFace repository ID (e.g., "hud-evals/my-tasks")
+         fields: Optional list of fields to save. If None, saves all fields from each dict.
+         **kwargs: Additional arguments passed to dataset.push_to_hub()
+     """
+     from datasets import Dataset
+
+     # Safety check: Ensure we're not saving Task objects (which have resolved env vars)
+     if tasks and isinstance(tasks[0], Task):
+         raise ValueError(
+             "save_tasks expects dictionaries, not Task objects. "
+             "Task objects have resolved environment variables which would expose secrets. "
+             "Please pass raw dictionaries with template strings like '${HUD_API_KEY}' preserved."
+         )
+
+     # Convert to rows with JSON string fields
+     data = []
+     for i, tc_dict in enumerate(tasks):
+         # Additional safety check for each item
+         if isinstance(tc_dict, Task):
+             raise ValueError(
+                 f"Item {i} is a Task object, not a dictionary. "
+                 "This would expose resolved environment variables. "
+                 "Please convert to dictionary format with template strings preserved."
+             )
+
+         row = {}
+
+         # Determine which fields to process
+         fields_to_process = fields if fields is not None else list(tc_dict.keys())
+
+         for field in fields_to_process:
+             if field in tc_dict:
+                 value = tc_dict[field]
+                 # Serialize complex types as JSON strings
+                 if isinstance(value, (dict | list)):
+                     row[field] = json.dumps(value)
+                 elif isinstance(value, (str | int | float | bool | type(None))):
+                     row[field] = value if value is not None else ""
+                 else:
+                     # For other types, convert to string
+                     row[field] = str(value)
+
+         data.append(row)
+
+     # Create and push dataset
+     dataset = Dataset.from_list(data)
+     dataset.push_to_hub(repo_id, **kwargs)
+ dataset.push_to_hub(repo_id, **kwargs)