hud-python 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/agents/base.py +118 -33
- hud/agents/claude.py +1 -1
- hud/agents/openai.py +5 -16
- hud/agents/tests/test_openai.py +24 -79
- hud/cli/__init__.py +137 -15
- hud/cli/analyze.py +2 -4
- hud/cli/build.py +6 -2
- hud/cli/dev.py +67 -0
- hud/cli/eval.py +90 -35
- hud/cli/hf.py +406 -0
- hud/cli/init.py +38 -19
- hud/cli/rl/README.md +243 -0
- hud/cli/rl/__init__.py +82 -0
- hud/cli/rl/init.py +370 -0
- hud/cli/rl/pod.py +491 -0
- hud/cli/rl/ssh.py +288 -0
- hud/cli/rl/train.py +421 -0
- hud/cli/rl/utils.py +165 -0
- hud/cli/tests/test_mcp_server.py +1 -4
- hud/clients/base.py +2 -0
- hud/clients/fastmcp.py +7 -2
- hud/clients/mcp_use.py +3 -1
- hud/clients/utils/retry_transport.py +34 -8
- hud/datasets/__init__.py +32 -0
- hud/datasets/execution/__init__.py +13 -0
- hud/datasets/execution/parallel.py +592 -0
- hud/datasets/execution/runner.py +123 -0
- hud/datasets/task.py +107 -0
- hud/datasets/utils.py +118 -0
- hud/otel/instrumentation.py +2 -1
- hud/server/server.py +58 -21
- hud/settings.py +12 -0
- hud/types.py +31 -10
- hud/utils/design.py +168 -2
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/METADATA +4 -3
- {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/RECORD +41 -28
- hud/datasets.py +0 -327
- {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/WHEEL +0 -0
- {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/licenses/LICENSE +0 -0
hud/datasets.py
DELETED
|
@@ -1,327 +0,0 @@
|
|
|
1
|
-
"""Dataset utilities for working with HuggingFace datasets and Tasks."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import asyncio
|
|
6
|
-
import json
|
|
7
|
-
import logging
|
|
8
|
-
from string import Template
|
|
9
|
-
from typing import TYPE_CHECKING, Any, cast
|
|
10
|
-
|
|
11
|
-
from datasets import Dataset, load_dataset
|
|
12
|
-
from pydantic import BaseModel, Field, field_validator
|
|
13
|
-
|
|
14
|
-
from hud.agents.misc import ResponseAgent
|
|
15
|
-
from hud.settings import settings
|
|
16
|
-
|
|
17
|
-
from .types import MCPToolCall
|
|
18
|
-
|
|
19
|
-
if TYPE_CHECKING:
|
|
20
|
-
from hud.agents import MCPAgent
|
|
21
|
-
|
|
22
|
-
logger = logging.getLogger("hud.datasets")
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
class _EnvTemplate(Template):
    """``string.Template`` variant whose braced form also accepts ``${NAME:default}``.

    The stock ``Template`` treats ``${NAME:default}`` as an invalid placeholder, so
    ``substitute`` raises ``ValueError`` for it. Widening ``braceidpattern`` lets the
    braced group capture ``NAME:default`` as one lookup key, which ``_EnvMapping``
    then splits. The unbraced ``$NAME`` form and ``$$`` escaping are inherited
    unchanged.
    """

    # Identifier, optionally followed by ":" and any text up to the closing brace.
    braceidpattern = Template.idpattern + r"(?::[^}]*)?"


class _EnvMapping(dict):
    """Substitution mapping for ``_EnvTemplate`` that never raises ``KeyError``."""

    def __missing__(self, key: str) -> Any:
        """Resolve a key that is not stored verbatim.

        ``key`` is either ``NAME`` or ``NAME:default``. A known ``NAME`` wins over
        the default; an unknown ``NAME`` yields the default, which is the empty
        string when no ``:default`` part was given.
        """
        name, _, default = key.partition(":")
        if name in self:
            return self[name]
        return default


class Task(BaseModel):
    """
    A task configuration that can be used to create a task.

    The mcp_config field supports environment variable substitution using
    template placeholders in the format ${VAR_NAME} or ${VAR_NAME:default_value}.
    A placeholder whose variable is unknown resolves to its default, or to an
    empty string when no default is given.

    Example:
        mcp_config: {
            "hud": {
                "url": "${HUD_MCP_URL:https://mcp.hud.so/v3/mcp}",
                "headers": {
                    "Authorization": "Bearer ${HUD_API_KEY}",
                    "Mcp-Image": "your-mcp-image"
                }
            }
        }
    """

    # Optional identifier for the task.
    id: str | None = None
    # Prompt text for the task (required).
    prompt: str
    # MCP configuration; a JSON string is parsed and placeholders are resolved
    # by the validators below.
    mcp_config: dict[str, Any]
    # Tool call(s); dicts and JSON strings are converted by
    # convert_dict_to_tool_call.
    setup_tool: MCPToolCall | list[MCPToolCall] | None = None
    evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
    # Optional system prompt.
    system_prompt: str | None = None
    # Free-form metadata; a JSON string is parsed by parse_json_strings.
    metadata: dict[str, Any] = Field(default_factory=dict)

    @field_validator("mcp_config", "metadata", mode="before")
    @classmethod
    def parse_json_strings(cls, v: Any) -> Any:
        """Parse JSON strings into dictionaries.

        Non-string values are returned unchanged.

        Raises:
            ValueError: If ``v`` is a string that is not valid JSON.
        """
        if isinstance(v, str):
            try:
                return json.loads(v)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON string: {e}") from e
        return v

    @field_validator("setup_tool", "evaluate_tool", mode="before")
    @classmethod
    def convert_dict_to_tool_call(cls, v: Any) -> Any:
        """Convert dict to MCPToolCall instance, parsing JSON strings first.

        A dict becomes one ``MCPToolCall``; in a list, each dict item is converted
        and other items are kept as they are. ``None`` and any other value are
        returned unchanged.

        Raises:
            ValueError: If ``v`` is a string that is not valid JSON.
        """
        if v is None:
            return None

        # Parse JSON string if needed
        if isinstance(v, str):
            try:
                v = json.loads(v)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON string: {e}") from e

        if isinstance(v, dict):
            return MCPToolCall(**v)
        if isinstance(v, list):
            return [MCPToolCall(**item) if isinstance(item, dict) else item for item in v]
        return v

    @field_validator("mcp_config", mode="before")
    @classmethod
    def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
        """
        Automatically resolve environment variables in mcp_config using Template.

        Supports ${VAR_NAME} and ${VAR_NAME:default_value}. Values are looked up in
        the process environment, overlaid with ``settings.model_dump()``, and
        ``HUD_API_KEY`` is set from ``settings.api_key`` when that is truthy.

        Strings are substituted; dicts and lists are walked recursively (dict keys
        are left untouched); every other value is returned unchanged.

        Missing variables resolve to their default, or to an empty string when the
        placeholder carries no default.

        Raises:
            ValueError: From ``Template.substitute`` when a string contains a ``$``
                that does not form a valid placeholder (write ``$$`` for a literal
                dollar sign).
        """
        import os

        # Build the lookup table once per validation; later sources override
        # earlier ones.
        mapping = _EnvMapping(os.environ)
        mapping.update(settings.model_dump())

        if settings.api_key:
            mapping["HUD_API_KEY"] = settings.api_key

        def substitute_in_value(obj: Any) -> Any:
            """Recursively substitute variables in nested structures."""
            if isinstance(obj, str):
                return _EnvTemplate(obj).substitute(mapping)
            if isinstance(obj, dict):
                return {k: substitute_in_value(item) for k, item in obj.items()}
            if isinstance(obj, list):
                return [substitute_in_value(item) for item in obj]
            return obj

        return substitute_in_value(v)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
async def fetch_system_prompt_from_dataset(dataset_id: str) -> str | None:
    """
    Fetch system_prompt.txt from a HuggingFace dataset repository.

    The download and the file read are blocking calls, so they run in a worker
    thread via ``asyncio.to_thread`` instead of stalling the event loop.

    Args:
        dataset_id: HuggingFace dataset identifier (e.g., "hud-evals/SheetBench-50")

    Returns:
        The stripped system prompt text if the file exists and is non-empty.
        None when the file is missing or empty, when huggingface_hub is not
        installed, or when any other error occurs (the error is logged, not raised).
    """
    try:
        # Import here to avoid unnecessary dependency
        from huggingface_hub import hf_hub_download
        from huggingface_hub.errors import EntryNotFoundError

        def _download_and_read() -> str:
            """Download system_prompt.txt and return its stripped content."""
            file_path = hf_hub_download(
                repo_id=dataset_id, filename="system_prompt.txt", repo_type="dataset"
            )
            with open(file_path, encoding="utf-8") as f:
                return f.read().strip()

        try:
            content = await asyncio.to_thread(_download_and_read)
        except EntryNotFoundError:
            logger.debug("No system_prompt.txt found in dataset %s", dataset_id)
            return None

        if content:
            logger.info(
                "Loaded system prompt from %s (length: %d chars)", dataset_id, len(content)
            )
            return content

        logger.warning("System prompt file is empty in %s", dataset_id)
        return None

    except ImportError:
        logger.warning(
            "huggingface_hub not installed. Install it to fetch system prompts from datasets."
        )
        return None
    except Exception as e:
        # Deliberate best-effort: a missing prompt must not abort the caller.
        logger.error("Error fetching system prompt from %s: %s", dataset_id, e)
        return None
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
def _dataset_link_from_checksums(dataset: Dataset) -> str | None:
    """Best-effort "<project>/<name>" link taken from a Dataset's download checksums.

    Returns None when the checksum info is absent, empty, or not in the expected
    shape (for example a Dataset built in memory), instead of raising.
    """
    try:
        first_source = next(iter(dataset.info.__dict__["download_checksums"]))
        # NOTE(review): assumes keys look like
        # "hf://datasets/<project>/<name>@<revision>/..." - confirm against the
        # installed `datasets` version.
        parts = first_source.split("/")
        return f"{parts[3]}/{parts[4].split('@')[0]}"
    except (AttributeError, IndexError, KeyError, StopIteration, TypeError):
        logger.debug("Could not derive a dataset link from download checksums")
        return None


async def run_dataset(
    name: str,
    dataset: str | Dataset | list[dict[str, Any]],
    agent_class: type[MCPAgent],
    agent_config: dict[str, Any] | None = None,
    max_concurrent: int = 50,
    metadata: dict[str, Any] | None = None,
    max_steps: int = 40,
    split: str = "train",
    auto_respond: bool = False,
    custom_system_prompt: str | None = None,
) -> list[Any]:
    """
    Run all tasks in a dataset with automatic job tracking.

    Args:
        name: Name for the job
        dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
            Dataset object, OR list of task dictionaries
        agent_class: Agent class to instantiate (e.g., ClaudeAgent)
        agent_config: Configuration/kwargs for agent (model, etc.)
        max_concurrent: Maximum parallel task execution
        metadata: Optional metadata for the job (copied, never modified in place)
        max_steps: Maximum steps per task
        split: Dataset split to use when loading from string (default: "train")
        auto_respond: Whether to use auto-response agent
        custom_system_prompt: Value stored under "system_prompt" for every task
            dict that has no "system_prompt" key of its own

    Returns:
        List of results from agent.run() in dataset order. The entry for a task
        that raised stays None; the failure is logged rather than propagated.

    Example:
        >>> from hud.agents import ClaudeAgent
        >>> # Option 1: From dataset string identifier
        >>> results = await run_dataset(
        ...     "SheetBench Eval",
        ...     "hud-evals/SheetBench-50",
        ...     ClaudeAgent,
        ...     {"model": "claude-3-5-sonnet-20241022"},
        ... )
        >>> # Option 2: From HuggingFace dataset object
        >>> from datasets import load_dataset
        >>> dataset = load_dataset("hud-evals/SheetBench-50", split="train")
        >>> results = await run_dataset("my_eval", dataset, ClaudeAgent)
        >>> # Option 3: From list of dicts
        >>> tasks = [{"prompt": "...", "mcp_config": {...}, ...}, ...]
        >>> results = await run_dataset("browser_eval", tasks, ClaudeAgent)
    """
    # Import here to avoid circular imports
    import hud

    dataset_link = None

    # Load dataset from string if needed
    if isinstance(dataset, str):
        logger.info("Loading dataset %s from HuggingFace...", dataset)
        dataset_link = dataset

        # Load dataset from HuggingFace
        dataset = cast("Dataset", load_dataset(dataset, split=split))

    # Create job context. Copy first so the caller's metadata dict is not mutated.
    job_metadata = dict(metadata) if metadata else {}
    job_metadata["agent_class"] = agent_class.__name__
    job_metadata["agent_config"] = agent_config

    # Extract dataset verification info if available
    if isinstance(dataset, Dataset) and not dataset_link:
        dataset_link = _dataset_link_from_checksums(dataset)

    with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
        # Run tasks with semaphore for concurrency control
        sem = asyncio.Semaphore(max_concurrent)
        results: list[Any | None] = [None] * len(dataset)

        async def _worker(index: int, task_dict: Any, max_steps: int = 40) -> None:
            async with sem:
                # Shallow copy so the system-prompt default below does not leak
                # into the caller's task dicts.
                task_dict = dict(task_dict)
                # Create trace for this task
                task_name = task_dict.get("prompt") or f"Task {index}"
                if "system_prompt" not in task_dict:
                    task_dict["system_prompt"] = custom_system_prompt
                with hud.trace(task_name, job_id=job_obj.id, task_id=task_dict.get("id")):
                    # Convert dict to Task here, at trace level
                    task = Task(**task_dict)

                    agent = agent_class(**(agent_config or {}))

                    if auto_respond:
                        agent.response_agent = ResponseAgent()
                    results[index] = await agent.run(task, max_steps=max_steps)

        # Execute all tasks
        outcomes = await asyncio.gather(
            *[_worker(i, task, max_steps=max_steps) for i, task in enumerate(dataset)],
            return_exceptions=True,  # Don't fail entire batch on one error
        )

        # Surface failures that gather() captured; their result slots stay None.
        for index, outcome in enumerate(outcomes):
            if isinstance(outcome, BaseException):
                logger.error("Task %d failed: %s", index, outcome)

        return results
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
def save_tasks(
    tasks: list[dict[str, Any]], repo_id: str, fields: list[str] | None = None, **kwargs: Any
) -> None:
    """
    Push a list of task dictionaries to a HuggingFace dataset repository.

    Each dictionary becomes one row. Dict and list values are stored as JSON
    strings, None is stored as an empty string, str/int/float/bool values are
    stored as they are, and anything else is stored as ``str(value)``.

    Args:
        tasks: List of dictionaries to save
        repo_id: HuggingFace repository ID (e.g., "hud-evals/my-tasks")
        fields: Optional list of fields to save. If None, saves all fields from each
            dict. A listed field that a dict does not contain is skipped for that row.
        **kwargs: Additional arguments passed to dataset.push_to_hub()

    Raises:
        ValueError: If any item is a Task object rather than a dictionary; Task
            objects carry resolved environment variables and must not be uploaded.
    """
    from datasets import Dataset

    def _encode(value: Any) -> Any:
        """Turn one field value into its stored form."""
        if isinstance(value, (dict, list)):
            return json.dumps(value)
        if value is None:
            return ""
        if isinstance(value, (str, int, float, bool)):
            return value
        return str(value)

    # Fail fast on the common mistake of passing Task objects instead of raw dicts.
    if tasks and isinstance(tasks[0], Task):
        raise ValueError(
            "save_tasks expects dictionaries, not Task objects. "
            "Task objects have resolved environment variables which would expose secrets. "
            "Please pass raw dictionaries with template strings like '${HUD_API_KEY}' preserved."
        )

    rows = []
    for position, record in enumerate(tasks):
        # Re-check every item, not only the first one.
        if isinstance(record, Task):
            raise ValueError(
                f"Item {position} is a Task object, not a dictionary. "
                "This would expose resolved environment variables. "
                "Please convert to dictionary format with template strings preserved."
            )

        wanted = list(record.keys()) if fields is None else fields
        rows.append({key: _encode(record[key]) for key in wanted if key in record})

    # Create and push dataset
    Dataset.from_list(rows).push_to_hub(repo_id, **kwargs)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|