hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/datasets/loader.py
ADDED
@@ -0,0 +1,327 @@
"""Task loading utilities for HUD.

Unified interface for loading evaluation tasks from:
- HUD API (v5 format)
- Local JSON/JSONL files (v4 LegacyTask format, auto-converted)
- HuggingFace datasets (v4 LegacyTask format, auto-converted)
"""

from __future__ import annotations

import json
import logging
import warnings
from pathlib import Path
from typing import TYPE_CHECKING, Any, overload

if TYPE_CHECKING:
    from hud.eval.task import Task

logger = logging.getLogger(__name__)

__all__ = ["load_dataset", "load_tasks", "save_tasks"]


def _load_raw_from_file(path: Path) -> list[dict[str, Any]]:
    """Load raw task dicts from a local JSON or JSONL file."""
    raw_items: list[dict[str, Any]] = []

    if path.suffix == ".jsonl":
        # JSONL: one task per line
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                item = json.loads(line)
                # Handle case where line contains a list
                if isinstance(item, list):
                    raw_items.extend(i for i in item if isinstance(i, dict))
                elif isinstance(item, dict):
                    raw_items.append(item)
                else:
                    raise ValueError(
                        f"Invalid JSONL format: expected dict or list, got {type(item)}"
                    )
    else:
        # JSON: array of tasks
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        if isinstance(data, list):
            raw_items = [item for item in data if isinstance(item, dict)]
        elif isinstance(data, dict):
            raw_items = [data]
        else:
            raise ValueError(f"JSON file must contain an array or object, got {type(data)}")

    return raw_items


def _load_from_file(path: Path) -> list[Task]:
    """Load tasks from a local JSON or JSONL file."""
    from hud.eval.task import Task

    raw_items = _load_raw_from_file(path)
    return [Task(**item) for item in raw_items]


def _load_raw_from_huggingface(dataset_name: str) -> list[dict[str, Any]]:
    """Load raw task dicts from HuggingFace dataset."""
    try:
        from datasets import load_dataset as hf_load_dataset
    except ImportError as e:
        raise ImportError(
            "Please install 'datasets' to load from HuggingFace: uv pip install datasets"
        ) from e

    # Parse dataset name and optional split
    if ":" in dataset_name:
        name, split = dataset_name.split(":", 1)
    else:
        name = dataset_name
        split = "train"  # Default split

    logger.info("Loading from HuggingFace dataset: %s (split=%s)", name, split)
    dataset = hf_load_dataset(name, split=split)

    raw_items: list[dict[str, Any]] = []
    for item in dataset:
        if not isinstance(item, dict):
            raise ValueError(f"Invalid HuggingFace dataset: expected dict, got {type(item)}")
        raw_items.append(dict(item))

    return raw_items


def _load_from_huggingface(dataset_name: str) -> list[Task]:
    """Load tasks from HuggingFace dataset."""
    raw_items = _load_raw_from_huggingface(dataset_name)
    from hud.eval.task import Task

    return [Task(**item) for item in raw_items]


def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
    """Load raw task dicts from HUD API."""
    import httpx

    from hud.settings import settings

    headers = {}
    if settings.api_key:
        headers["Authorization"] = f"Bearer {settings.api_key}"

    with httpx.Client() as client:
        response = client.get(
            f"{settings.hud_api_url}/tasks/evalset/{dataset_name}",
            headers=headers,
            params={"all": "true"},
        )
        response.raise_for_status()
        data = response.json()

    # Extract tasks dict from response
    tasks_dict = data.get("tasks", {})

    raw_items: list[dict[str, Any]] = []
    for task_id, task_data in tasks_dict.items():
        if task_data.get("id") is None:
            task_data["id"] = task_id
        raw_items.append(task_data)

    return raw_items


def _load_from_api(dataset_name: str) -> list[Task]:
    """Load tasks from HUD API."""
    from hud.eval.task import Task

    raw_items = _load_raw_from_api(dataset_name)
    return [Task(**item) for item in raw_items]


@overload
def load_tasks(source: str, *, raw: bool = False) -> list[Task]: ...


@overload
def load_tasks(source: str, *, raw: bool = True) -> list[dict[str, Any]]: ...


def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, Any]]:
    """Load tasks from a source.

    Supports multiple sources with auto-detection:
    - Local file path (JSON or JSONL)
    - HUD API dataset slug (e.g., "hud-evals/SheetBench-50")
    - HuggingFace dataset (e.g., "username/dataset" or "username/dataset:split")

    Automatically detects and converts v4 LegacyTask format to v5 Task.

    Args:
        source: Task source. Can be:
            - Path to a local JSON/JSONL file
            - HUD API dataset slug (e.g., "hud-evals/SheetBench-50")
            - HuggingFace dataset name (e.g., "hud-evals/tasks" or "hud-evals/tasks:train")
        raw: If True, return raw dicts without validation or env var substitution.
            Useful for preserving template strings like "${HUD_API_KEY}".

    Returns:
        - If raw=False (default): list[Task] ready to use with hud.eval()
        - If raw=True: list[dict] with raw task data

    Example:
        ```python
        import hud
        from hud.datasets import load_tasks

        # Load from HUD API
        tasks = load_tasks("hud-evals/SheetBench-50")

        # Load from local file (v4 format auto-converted)
        tasks = load_tasks("./my-tasks.json")

        # Load from HuggingFace
        tasks = load_tasks("hud-evals/benchmark:test")

        # Load raw dicts (preserves env var placeholders)
        raw_tasks = load_tasks("./tasks.json", raw=True)

        # Run evaluation
        async with hud.eval(tasks) as ctx:
            await agent.run(ctx)
        ```

    Raises:
        ValueError: If task loading fails
    """
    # Check if it's a local file
    path = Path(source)
    if path.exists() and path.suffix in {".json", ".jsonl"}:
        logger.info("Loading tasks from file: %s", source)
        items = _load_raw_from_file(path) if raw else _load_from_file(path)
        logger.info("Loaded %d tasks from %s", len(items), source)
        return items

    # Try HUD API first
    try:
        logger.info("Trying HUD API: %s", source)
        items = _load_raw_from_api(source) if raw else _load_from_api(source)
        logger.info("Loaded %d tasks from HUD API: %s", len(items), source)
        return items
    except Exception as hud_error:
        logger.debug("HUD API load failed (%s), trying HuggingFace", hud_error)

    # Try HuggingFace as fallback
    try:
        logger.info("Trying HuggingFace dataset: %s", source)
        items = _load_raw_from_huggingface(source) if raw else _load_from_huggingface(source)
        logger.info("Loaded %d tasks from HuggingFace: %s", len(items), source)
        return items
    except ImportError:
        raise ValueError(
            f"Failed to load tasks from '{source}'. "
            "Install 'datasets' package for HuggingFace support."
        ) from None
    except Exception as hf_error:
        raise ValueError(f"Failed to load tasks from '{source}': {hf_error}") from hf_error


def save_tasks(
    name: str,
    tasks: list[Task],
) -> str:
    """Save tasks to the HUD API.

    Creates or updates an evalset with the given tasks.

    Args:
        name: Evalset name/slug (e.g., "my-evals/benchmark-v1").
            If no org prefix, uses user's default org.
        tasks: List of Task objects (v5 format) to save.

    Returns:
        The evalset ID of the created/updated evalset.

    Example:
        ```python
        from hud.datasets import save_tasks, load_tasks
        from hud.eval.task import Task
        from hud.environment import Environment

        # Create tasks
        env = Environment("my-env")
        tasks = [
            Task(env=env, scenario="checkout", args={"user": "alice"}),
            Task(env=env, scenario="checkout", args={"user": "bob"}),
        ]

        # Save to HUD API
        evalset_id = save_tasks("my-evals/benchmark-v1", tasks)

        # Later, load them back
        loaded = load_tasks("my-evals/benchmark-v1")
        ```

    Raises:
        TypeError: If any task is not a v5 Task object (must have 'scenario')
        ValueError: If API key is not set or save fails
    """
    import httpx

    from hud.settings import settings

    if not settings.api_key:
        raise ValueError("HUD_API_KEY is required to save tasks")

    # Validate all tasks are v5 format (must have 'scenario')
    for i, task in enumerate(tasks):
        if not hasattr(task, "scenario"):
            raise TypeError(
                f"Task at index {i} is missing 'scenario' - only v5 Task objects can be saved. "
                "Use Task.from_v4(legacy_task) to convert from LegacyTask."
            )

    # Convert tasks to dicts (Task is a Pydantic model)
    task_dicts = [task.model_dump(mode="json", exclude_none=True) for task in tasks]

    # Build request payload
    payload: dict[str, Any] = {
        "name": name,
        "tasks": task_dicts,
    }

    headers = {"Authorization": f"Bearer {settings.api_key}"}

    try:
        with httpx.Client(timeout=60) as client:
            response = client.post(
                f"{settings.hud_api_url}/tasks/evalset",
                json=payload,
                headers=headers,
            )
            response.raise_for_status()
            data = response.json()
            evalset_id = data.get("evalset_id") or data.get("id") or name
            logger.info("Saved %d tasks to evalset: %s", len(tasks), evalset_id)
            return evalset_id
    except httpx.HTTPStatusError as e:
        raise ValueError(f"Failed to save tasks: {e.response.text}") from e
    except Exception as e:
        raise ValueError(f"Failed to save tasks: {e}") from e


# Deprecated alias for backwards compatibility
def load_dataset(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, Any]]:
    """Deprecated: Use load_tasks() instead.

    .. deprecated:: 0.6.0
        load_dataset() is deprecated. Use load_tasks() instead.
    """
    warnings.warn(
        "load_dataset() is deprecated. Use load_tasks() instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return load_tasks(source, raw=raw)
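The loader resolves a source in a fixed order (local file, then HUD API, then HuggingFace), so callers normally pass a bare path or slug. Below is a minimal usage sketch built only from the functions defined above; the file `./my-tasks.jsonl` and the evalset slug `my-org/smoke-tests` are placeholder names, and a valid `HUD_API_KEY` is assumed for the save/reload round trip.

```python
from hud.datasets import load_tasks, save_tasks

# Local JSONL file: v4 LegacyTask rows are auto-converted to v5 Task objects.
tasks = load_tasks("./my-tasks.jsonl")

# raw=True skips validation and keeps "${HUD_API_KEY}"-style placeholders intact.
raw_rows = load_tasks("./my-tasks.jsonl", raw=True)

# Push the validated tasks up as an evalset, then reload them by slug.
evalset_id = save_tasks("my-org/smoke-tests", tasks)
round_tripped = load_tasks("my-org/smoke-tests")
```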
hud/datasets/runner.py
CHANGED
@@ -1,126 +1,213 @@
-"""
+"""Core task runner for evaluating agents on datasets.
+
+Requires the [agents] extra: pip install hud-python[agents]
+"""
 
 from __future__ import annotations
 
-import asyncio
 import logging
-from typing import TYPE_CHECKING, Any
-
-from datasets import Dataset, load_dataset
+from typing import TYPE_CHECKING, Any
 
-from hud.types import
+import hud
+from hud.types import AgentType, LegacyTask, TaskInput, Trace
 
 if TYPE_CHECKING:
-    from
+    from collections.abc import Sequence
+
+    from hud.eval.context import EvalContext
+    from hud.eval.task import Task
 
 logger = logging.getLogger("hud.datasets")
 
 
 async def run_dataset(
-    max_concurrent: int = 30,
-    metadata: dict[str, Any] | None = None,
+    tasks: str | TaskInput | Sequence[TaskInput],
+    agent_type: str | AgentType,
+    *,
+    agent_params: dict[str, Any] | None = None,
     max_steps: int = 10,
-) -> list[
-    """
+    max_concurrent: int = 30,
+    group_size: int = 1,
+    quiet: bool = True,
+) -> list[EvalContext]:
+    """Run an agent on a dataset of tasks.
+
+    This is the primary entry point for running evaluations programmatically.
+    The agent is created fresh for each task context to ensure correct tool initialization.
 
     Args:
-        custom_system_prompt: Override system prompt for all tasks
+        tasks: Tasks to run. Can be:
+            - A source string (file path, API slug) - loaded via load_tasks()
+            - A single TaskInput (Task, LegacyTask, or dict)
+            - A list of TaskInput objects
+        agent_type: Type of agent to create (e.g., "claude", "openai", AgentType.CLAUDE).
+        agent_params: Parameters to pass to agent.create().
+        max_steps: Maximum steps per task.
+        max_concurrent: Maximum concurrent tasks (for parallel execution).
+        group_size: Number of times to run each task (for variance estimation).
+        quiet: Whether to suppress printing eval links and opening browser (default True).
 
     Returns:
-        List of results from
+        List of EvalContext results from each task execution. Access `.reward` on each.
 
     Example:
-        >>> dataset = load_dataset("hud-evals/SheetBench-50", split="train")
-        >>> results = await run_dataset("my_eval", dataset, ClaudeAgent)
-        >>> # Option 3: From list of dicts
-        >>> tasks = [{"prompt": "...", "mcp_config": {...}, ...}, ...]
-        >>> results = await run_dataset("browser_eval", tasks, ClaudeAgent)
-    """
-    # Import here to avoid circular imports
-    import hud
-
-    dataset_link = None
-
-    # Load dataset from string if needed
-    if isinstance(dataset, str):
-        logger.info("Loading dataset %s from HuggingFace...", dataset)
-        dataset_link = dataset
-
-        # Load dataset from HuggingFace
-        dataset = cast("Dataset", load_dataset(dataset, split=split))
-
-    # Create job context
-    job_metadata = metadata or {}
-    job_metadata["agent_class"] = agent_class.__name__
-    job_metadata["agent_config"] = agent_config
-
-    # Extract dataset verification info if available
-    if isinstance(dataset, Dataset) and not dataset_link:
-        try:
-            general_info = next(iter(dataset.info.__dict__["download_checksums"].keys())).split("/")
-            project = general_info[3]
-            dataset_name = general_info[4].split("@")[0]
-            dataset_link = f"{project}/{dataset_name}"
-        except Exception:
-            logger.warning("Failed to extract dataset verification info")
-
-    with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
-        # Run tasks with semaphore for concurrency control
-        sem = asyncio.Semaphore(max_concurrent)
-        results: list[Any | None] = [None] * len(dataset)
-
-        async def _worker(index: int, task_dict: Any, max_steps: int = 10) -> None:
-            async with sem:
-                # Create trace for this task
-                task_name = task_dict.get("prompt") or f"Task {index}"
-                if custom_system_prompt and "system_prompt" not in task_dict:
-                    task_dict["system_prompt"] = custom_system_prompt
-                # Ensure task_id is a string for baggage propagation
-                raw_task_id = task_dict.get("id")
-                safe_task_id = str(raw_task_id) if raw_task_id is not None else None
-                with hud.trace(task_name, job_id=job_obj.id, task_id=safe_task_id):
-                    # Convert dict to Task here, at trace level
-                    task = Task(**task_dict)
-
-                    agent = agent_class(**(agent_config or {}))
-
-                    if auto_respond:
-                        agent.response_agent = ResponseAgent()
-                    results[index] = await agent.run(task, max_steps=max_steps)
-
-        # Execute all tasks
-        await asyncio.gather(
-            *[_worker(i, task, max_steps=max_steps) for i, task in enumerate(dataset)],
-            return_exceptions=True,  # Don't fail entire batch on one error
+        ```python
+        from hud.datasets import load_tasks, run_dataset
+
+        # Load tasks and run
+        tasks = load_tasks("my-tasks.json")
+        results = await run_dataset(
+            tasks,
+            agent_type="claude",
+            agent_params={"checkpoint_name": "claude-sonnet-4-20250514"},
+            max_steps=50,
         )
 
+        for ctx in results:
+            print(f"Reward: {ctx.reward}")
+        ```
+    """
+    from hud.datasets.loader import load_tasks
+    from hud.eval.task import Task
+
+    # Normalize tasks to list[Task]
+    task_list: list[Task]
+    if isinstance(tasks, str):
+        task_list = load_tasks(tasks)
+    elif isinstance(tasks, Task):
+        task_list = [tasks]
+    elif isinstance(tasks, LegacyTask | dict):
+        # Single LegacyTask or dict - convert to Task
+        task_list = [Task.from_v4(tasks)]
+    else:
+        # Sequence of TaskInput - convert each to Task
+        task_list = [t if isinstance(t, Task) else Task.from_v4(t) for t in tasks]
+
+    if not task_list:
+        raise ValueError("No tasks to run")
+
+    # Resolve agent class
+    agent_type_enum = agent_type if isinstance(agent_type, AgentType) else AgentType(agent_type)
+    agent_cls = agent_type_enum.cls
+
+    # Use hud.eval() for both single and parallel execution
+    async with hud.eval(
+        task_list,
+        group=group_size,
+        max_concurrent=max_concurrent,
+        quiet=quiet,
+    ) as ctx:
+        # Create agent fresh for each context (ensures correct tool initialization)
+        agent = agent_cls.create(**(agent_params or {}))
+        await agent.run(ctx, max_steps=max_steps)
+        # Reward is computed by EvalContext.__aexit__ from evaluate tools
+
+    # For parallel execution, results are collected via ctx.results
+    if hasattr(ctx, "results") and ctx.results:
+        return ctx.results
+
+    return [ctx]
+
+
+async def run_single_task(
+    task: Task,
+    *,
+    agent_type: AgentType,
+    agent_params: dict[str, Any] | None = None,
+    max_steps: int = 10,
+    job_id: str | None = None,
+    task_id: str | None = None,
+    group_id: str | None = None,
+    trace_name: str | None = None,
+    metadata: dict[str, Any] | None = None,
+    trace_id: str | None = None,
+    api_key: str | None = None,
+    trace: bool = True,
+    quiet: bool = False,
+) -> Trace:
+    """Run a single task with full control over eval context parameters.
+
+    This is the low-level entry point for running individual tasks with explicit
+    trace/job/group IDs. Used by remote execution workers.
+
+    Args:
+        task: Task object to run. Use Task.from_v4() or load_tasks() to create.
+        agent_type: AgentType enum specifying the agent to use.
+        agent_params: Parameters passed to agent.create(). Should include
+            pre-configured model_client for inference gateway usage.
+        max_steps: Maximum steps allowed for the agent.
+        job_id: HUD job identifier for telemetry association.
+        task_id: Task identifier (used in trace name if trace_name not provided).
+        group_id: Optional group identifier for parallel runs.
+        trace_name: Name for the trace (defaults to task_id or task.id).
+        metadata: Additional metadata for the trace context.
+        trace_id: Pre-assigned trace ID (if provided by backend).
+        api_key: API key override for telemetry and backend calls.
+        trace: Whether to send trace data to backend (default True).
+        quiet: Whether to suppress printing eval link (default False).
+
+    Returns:
+        Trace result from the agent run.
+
+    Example:
+        ```python
+        from hud.datasets import run_single_task
+        from hud.eval.task import Task
+        from hud.types import AgentType
+        from openai import AsyncOpenAI
+
+        # Create task (from v4 dict or directly)
+        task = Task.from_v4({"prompt": "...", "mcp_config": {...}, "evaluate_tool": {...}})
+
+        # Configure agent with inference gateway
+        agent_params = {
+            "checkpoint_name": "gpt-4o",
+            "validate_api_key": False,
+            "model_client": AsyncOpenAI(
+                api_key=hud_api_key,
+                base_url=settings.hud_gateway_url,
+            ),
+        }
+
+        result = await run_single_task(
+            task=task,
+            agent_type=AgentType.OPENAI,
+            agent_params=agent_params,
+            max_steps=20,
+            job_id="job-123",
+            task_id="task-456",
+        )
+        ```
+    """
+    # Determine trace name
+    effective_trace_name = trace_name or task_id or task.id or "single_task"
+
+    # Run with explicit eval context parameters
+    async with hud.eval(
+        task,
+        name=effective_trace_name,
+        job_id=job_id,
+        group_id=group_id,
+        trace_id=trace_id,
+        api_key=api_key,
+        trace=trace,
+        quiet=quiet,
+    ) as ctx:
+        # Build agent params - use system_prompt from ctx (set from task.agent_config)
+        final_agent_params = dict(agent_params or {})
+        if ctx.system_prompt and "system_prompt" not in final_agent_params:
+            final_agent_params["system_prompt"] = ctx.system_prompt
+
+        # Create agent inside ctx so it has access to context-derived values
+        agent_cls = agent_type.cls
+        agent = agent_cls.create(**final_agent_params)
+
+        # Store metadata if provided
+        if metadata:
+            ctx.metadata.update(metadata)
+
+        result = await agent.run(ctx, max_steps=max_steps)
+        # Reward is computed by EvalContext.__aexit__ from evaluate tools
+
+        # Return the Trace (ctx.reward is set by EvalContext.__aexit__)
+        return result
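For completeness, a runnable sketch of calling the new entry point as its docstring describes; the task source, agent type, and checkpoint name below are illustrative placeholders, and the [agents] extra is assumed to be installed.

```python
import asyncio

from hud.datasets import run_dataset


async def main() -> None:
    # A source string is resolved through load_tasks(); "claude" and the
    # checkpoint below are placeholder values.
    results = await run_dataset(
        "hud-evals/SheetBench-50",
        agent_type="claude",
        agent_params={"checkpoint_name": "claude-sonnet-4-20250514"},
        max_steps=25,
        max_concurrent=10,
    )
    for ctx in results:
        print(ctx.reward)


asyncio.run(main())
```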