hud-python 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

hud/datasets.py DELETED
@@ -1,327 +0,0 @@
1
- """Dataset utilities for working with HuggingFace datasets and Tasks."""
2
-
3
- from __future__ import annotations
4
-
5
- import asyncio
6
- import json
7
- import logging
8
- from string import Template
9
- from typing import TYPE_CHECKING, Any, cast
10
-
11
- from datasets import Dataset, load_dataset
12
- from pydantic import BaseModel, Field, field_validator
13
-
14
- from hud.agents.misc import ResponseAgent
15
- from hud.settings import settings
16
-
17
- from .types import MCPToolCall
18
-
19
- if TYPE_CHECKING:
20
- from hud.agents import MCPAgent
21
-
22
- logger = logging.getLogger("hud.datasets")
23
-
24
-
25
class _EnvTemplate(Template):
    """``string.Template`` variant whose braced form also accepts ``${NAME:default}``."""

    # The optional ``:default`` tail is matched as part of the braced
    # identifier, so the mapping lookup receives the key "NAME:default".
    braceidpattern = r"(?a:[_a-z][_a-z0-9]*)(?::[^}]*)?"


class _EnvLookup:
    """Mapping adapter that resolves ``NAME`` and ``NAME:default`` placeholder keys."""

    def __init__(self, values: dict[str, Any]) -> None:
        self._values = values

    def __getitem__(self, key: str) -> Any:
        name, has_default, default = key.partition(":")
        if has_default:
            value = self._values.get(name)
            # Fall back to the inline default when the variable is unset or empty.
            return default if value is None or value == "" else value
        # No inline default: missing variables resolve to an empty string.
        return self._values[name] if name in self._values else ""


class Task(BaseModel):
    """
    A task configuration that can be used to create a task.

    The mcp_config field supports environment variable substitution using
    template placeholders in the format ${VAR_NAME} or ${VAR_NAME:default_value}.
    The default is used when the variable is unset or empty; a placeholder
    without a default resolves to an empty string when the variable is unset.
    A literal dollar sign must be written as ``$$``.

    Example:
        mcp_config: {
            "hud": {
                "url": "${HUD_MCP_URL:https://mcp.hud.so/v3/mcp}",
                "headers": {
                    "Authorization": "Bearer ${HUD_API_KEY}",
                    "Mcp-Image": "your-mcp-image"
                }
            }
        }
    """

    id: str | None = None
    prompt: str
    mcp_config: dict[str, Any]
    setup_tool: MCPToolCall | list[MCPToolCall] | None = None
    evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
    system_prompt: str | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)

    @field_validator("mcp_config", "metadata", mode="before")
    @classmethod
    def parse_json_strings(cls, v: Any) -> Any:
        """Parse JSON strings into dictionaries.

        Non-string values are returned unchanged.

        Raises:
            ValueError: If ``v`` is a string that is not valid JSON.
        """
        if isinstance(v, str):
            try:
                return json.loads(v)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON string: {e}") from e
        return v

    @field_validator("setup_tool", "evaluate_tool", mode="before")
    @classmethod
    def convert_dict_to_tool_call(cls, v: Any) -> Any:
        """Convert dict to MCPToolCall instance, parsing JSON strings first.

        A dict becomes one ``MCPToolCall``; a list has each dict item converted
        and other items passed through; ``None`` and anything else is returned
        unchanged.

        Raises:
            ValueError: If ``v`` is a string that is not valid JSON.
        """
        if v is None:
            return None

        # Parse JSON string if needed
        if isinstance(v, str):
            try:
                v = json.loads(v)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON string: {e}") from e

        if isinstance(v, dict):
            return MCPToolCall(**v)
        if isinstance(v, list):
            return [MCPToolCall(**item) if isinstance(item, dict) else item for item in v]
        return v

    @field_validator("mcp_config", mode="before")
    @classmethod
    def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
        """
        Automatically resolve environment variables in mcp_config using Template.

        Supports ${VAR_NAME} and ${VAR_NAME:default_value} syntax. Values come
        from the process environment, overlaid with the fields of ``settings``;
        ``HUD_API_KEY`` is set from ``settings.api_key`` when that is truthy.

        Missing variables resolve to their inline default if one is given,
        otherwise to empty strings. Strings are substituted recursively through
        nested dicts and lists; other values are returned unchanged.

        Raises:
            ValueError: If a string contains a malformed placeholder
                (for example a lone ``$``).
        """
        import os

        # Start with current environment variables, then overlay settings
        mapping: dict[str, Any] = dict(os.environ)
        mapping.update(settings.model_dump())

        if settings.api_key:
            mapping["HUD_API_KEY"] = settings.api_key

        # Built once and shared by every string in the structure.
        lookup = _EnvLookup(mapping)

        def substitute_in_value(obj: Any) -> Any:
            """Recursively substitute variables in nested structures."""
            if isinstance(obj, str):
                return _EnvTemplate(obj).substitute(lookup)
            if isinstance(obj, dict):
                return {k: substitute_in_value(item) for k, item in obj.items()}
            if isinstance(obj, list):
                return [substitute_in_value(item) for item in obj]
            return obj

        return substitute_in_value(v)
119
-
120
-
121
async def fetch_system_prompt_from_dataset(dataset_id: str) -> str | None:
    """
    Download ``system_prompt.txt`` from a HuggingFace dataset repo and return its text.

    Args:
        dataset_id: HuggingFace dataset identifier (e.g., "hud-evals/SheetBench-50")

    Returns:
        The stripped file content, or None when the file is absent or empty,
        ``huggingface_hub`` is not installed, or any other error occurs
        (errors are logged, never raised).
    """
    try:
        # Deferred so huggingface_hub stays an optional dependency
        from huggingface_hub import hf_hub_download
        from huggingface_hub.errors import EntryNotFoundError

        try:
            local_path = hf_hub_download(
                repo_id=dataset_id, filename="system_prompt.txt", repo_type="dataset"
            )
            with open(local_path, encoding="utf-8") as handle:  # noqa: ASYNC230
                prompt_text = handle.read().strip()
        except EntryNotFoundError:
            # The repository exists but ships no prompt file
            logger.debug("No system_prompt.txt found in dataset %s", dataset_id)
            return None

        if not prompt_text:
            logger.warning("System prompt file is empty in %s", dataset_id)
            return None

        logger.info(
            "Loaded system prompt from %s (length: %d chars)", dataset_id, len(prompt_text)
        )
        return prompt_text

    except ImportError:
        logger.warning(
            "huggingface_hub not installed. Install it to fetch system prompts from datasets."
        )
        return None
    except Exception as e:
        logger.error("Error fetching system prompt from %s: %s", dataset_id, e)
        return None
166
-
167
-
168
def _infer_dataset_link(dataset: Dataset) -> str | None:
    """Best-effort recovery of a ``project/name`` link from a Dataset's download checksums."""
    try:
        checksums = dataset.info.__dict__["download_checksums"]
        # Presumably keys look like "hf://datasets/<project>/<name>@<revision>/..."
        # -- TODO confirm; segments 3 and 4 are what the link is built from.
        segments = next(iter(checksums.keys())).split("/")
        return f"{segments[3]}/{segments[4].split('@')[0]}"
    except (AttributeError, IndexError, KeyError, StopIteration, TypeError) as e:
        # The link is optional job metadata; never abort a run over it.
        logger.debug("Could not infer dataset link from checksums: %s", e)
        return None


async def run_dataset(
    name: str,
    dataset: str | Dataset | list[dict[str, Any]],
    agent_class: type[MCPAgent],
    agent_config: dict[str, Any] | None = None,
    max_concurrent: int = 50,
    metadata: dict[str, Any] | None = None,
    max_steps: int = 40,
    split: str = "train",
    auto_respond: bool = False,
    custom_system_prompt: str | None = None,
) -> list[Any]:
    """
    Run all tasks in a dataset with automatic job tracking.

    Args:
        name: Name for the job
        dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
            Dataset object, OR list of task dictionaries
        agent_class: Agent class to instantiate (e.g., ClaudeAgent)
        agent_config: Configuration/kwargs for agent (model, etc.)
        max_concurrent: Maximum parallel task execution
        metadata: Optional metadata for the job (copied, never mutated)
        max_steps: Maximum steps per task
        split: Dataset split to use when loading from string (default: "train")
        auto_respond: Whether to use auto-response agent
        custom_system_prompt: System prompt applied to tasks that have no
            "system_prompt" key of their own

    Returns:
        List of results from agent.run() in dataset order. A task that raised
        leaves ``None`` in its slot; the failure is logged.

    Example:
        >>> from hud.agents import ClaudeAgent
        >>> # Option 1: From dataset string identifier
        >>> results = await run_dataset(
        ...     "SheetBench Eval",
        ...     "hud-evals/SheetBench-50",
        ...     ClaudeAgent,
        ...     {"model": "claude-3-5-sonnet-20241022"},
        ... )
        >>> # Option 2: From HuggingFace dataset object
        >>> from datasets import load_dataset
        >>> dataset = load_dataset("hud-evals/SheetBench-50", split="train")
        >>> results = await run_dataset("my_eval", dataset, ClaudeAgent)
        >>> # Option 3: From list of dicts
        >>> tasks = [{"prompt": "...", "mcp_config": {...}, ...}, ...]
        >>> results = await run_dataset("browser_eval", tasks, ClaudeAgent)
    """
    # Import here to avoid circular imports
    import hud

    dataset_link = None

    # Load dataset from string if needed
    if isinstance(dataset, str):
        logger.info("Loading dataset %s from HuggingFace...", dataset)
        dataset_link = dataset

        # Load dataset from HuggingFace
        dataset = cast("Dataset", load_dataset(dataset, split=split))

    # Create job context on a copy so the caller's metadata dict is not modified
    job_metadata = dict(metadata or {})
    job_metadata["agent_class"] = agent_class.__name__
    job_metadata["agent_config"] = agent_config

    # Extract dataset verification info if available
    if isinstance(dataset, Dataset) and not dataset_link:
        dataset_link = _infer_dataset_link(dataset)

    with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
        # Run tasks with semaphore for concurrency control
        sem = asyncio.Semaphore(max_concurrent)
        results: list[Any | None] = [None] * len(dataset)

        async def _worker(index: int, task_dict: Any) -> None:
            async with sem:
                # Work on a copy so the caller's task dict is not modified
                task_dict = dict(task_dict)
                task_name = task_dict.get("prompt") or f"Task {index}"
                if "system_prompt" not in task_dict:
                    task_dict["system_prompt"] = custom_system_prompt
                # Create trace for this task
                with hud.trace(task_name, job_id=job_obj.id, task_id=task_dict.get("id")):
                    # Convert dict to Task here, at trace level
                    task = Task(**task_dict)

                    agent = agent_class(**(agent_config or {}))

                    if auto_respond:
                        agent.response_agent = ResponseAgent()
                    results[index] = await agent.run(task, max_steps=max_steps)

        # Execute all tasks
        outcomes = await asyncio.gather(
            *[_worker(i, task) for i, task in enumerate(dataset)],
            return_exceptions=True,  # Don't fail entire batch on one error
        )

        # Surface per-task failures instead of dropping them silently
        for index, outcome in enumerate(outcomes):
            if isinstance(outcome, BaseException):
                logger.error("Task %d failed: %r", index, outcome)

        return results
268
-
269
-
270
def save_tasks(
    tasks: list[dict[str, Any]], repo_id: str, fields: list[str] | None = None, **kwargs: Any
) -> None:
    """
    Push task dictionaries to a HuggingFace dataset, one row per dictionary.

    Dicts and lists are stored as JSON strings, ``None`` is stored as an empty
    string, str/int/float/bool values are stored as-is, and any other value is
    stored as ``str(value)``.

    Args:
        tasks: List of dictionaries to save
        repo_id: HuggingFace repository ID (e.g., "hud-evals/my-tasks")
        fields: Optional list of fields to save. If None, saves all fields from each dict.
            Listed fields that a dictionary lacks are omitted from its row.
        **kwargs: Additional arguments passed to dataset.push_to_hub()

    Raises:
        ValueError: If any item is a Task object rather than a dictionary.
    """
    from datasets import Dataset

    # Task objects carry resolved env vars, so refuse them up front
    if tasks and isinstance(tasks[0], Task):
        raise ValueError(
            "save_tasks expects dictionaries, not Task objects. "
            "Task objects have resolved environment variables which would expose secrets. "
            "Please pass raw dictionaries with template strings like '${HUD_API_KEY}' preserved."
        )

    def _to_cell(value: Any) -> Any:
        """Return the stored representation of a single field value."""
        if isinstance(value, (dict, list)):
            return json.dumps(value)
        if value is None:
            return ""
        if isinstance(value, (str, int, float, bool)):
            return value
        return str(value)

    rows = []
    for i, entry in enumerate(tasks):
        # Same guard for every item, not just the first
        if isinstance(entry, Task):
            raise ValueError(
                f"Item {i} is a Task object, not a dictionary. "
                "This would expose resolved environment variables. "
                "Please convert to dictionary format with template strings preserved."
            )

        wanted = list(entry.keys()) if fields is None else fields
        rows.append({key: _to_cell(entry[key]) for key in wanted if key in entry})

    # Create and push dataset
    Dataset.from_list(rows).push_to_hub(repo_id, **kwargs)