hud-python 0.4.28__py3-none-any.whl → 0.4.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +2 -1
- hud/agents/base.py +81 -45
- hud/agents/claude.py +8 -4
- hud/agents/openai_chat_generic.py +66 -40
- hud/agents/tests/test_base.py +0 -4
- hud/agents/tests/test_openai.py +1 -1
- hud/cli/__init__.py +182 -52
- hud/cli/dev.py +8 -9
- hud/cli/eval.py +317 -119
- hud/cli/flows/__init__.py +0 -0
- hud/cli/flows/tasks.py +0 -0
- hud/cli/get.py +160 -0
- hud/cli/rl/__init__.py +567 -71
- hud/cli/rl/config.py +94 -0
- hud/cli/rl/display.py +133 -0
- hud/cli/rl/gpu.py +63 -0
- hud/cli/rl/gpu_utils.py +318 -0
- hud/cli/rl/presets.py +96 -0
- hud/cli/rl/remote_runner.py +347 -0
- hud/cli/rl/rl_api.py +150 -0
- hud/cli/rl/vllm.py +177 -0
- hud/cli/tests/test_analyze_metadata.py +0 -1
- hud/cli/utils/tasks.py +26 -0
- hud/clients/base.py +21 -23
- hud/clients/mcp_use.py +36 -44
- hud/clients/tests/test_mcp_use_retry.py +10 -10
- hud/datasets/__init__.py +4 -3
- hud/datasets/{execution/parallel.py → parallel.py} +1 -1
- hud/datasets/{execution/runner.py → runner.py} +1 -1
- hud/datasets/utils.py +1 -1
- hud/native/comparator.py +6 -6
- hud/native/tests/test_comparator.py +8 -8
- hud/native/tests/test_native_init.py +13 -11
- hud/otel/config.py +1 -1
- hud/otel/instrumentation.py +35 -0
- hud/rl/README.md +30 -0
- hud/rl/__init__.py +1 -0
- hud/rl/actor.py +174 -0
- hud/rl/buffer.py +371 -0
- hud/rl/chat_template.jinja +101 -0
- hud/rl/config.py +184 -0
- hud/rl/distributed.py +95 -0
- hud/rl/learner.py +589 -0
- hud/rl/tests/__init__.py +1 -0
- hud/rl/tests/test_learner.py +171 -0
- hud/rl/train.py +354 -0
- hud/rl/types.py +101 -0
- hud/rl/utils/start_vllm_server.sh +30 -0
- hud/rl/utils.py +524 -0
- hud/rl/vllm_adapter.py +125 -0
- hud/settings.py +6 -0
- hud/telemetry/__init__.py +2 -1
- hud/telemetry/job.py +46 -3
- hud/telemetry/tests/test_trace.py +3 -3
- hud/telemetry/trace.py +85 -13
- hud/tools/tests/test_computer.py +3 -3
- hud/tools/tests/test_computer_actions.py +1 -1
- hud/types.py +123 -2
- hud/utils/group_eval.py +223 -0
- hud/utils/hud_console.py +113 -13
- hud/utils/tasks.py +119 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/METADATA +20 -2
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/RECORD +68 -48
- hud/cli/hf.py +0 -406
- hud/cli/rl/README.md +0 -243
- hud/cli/rl/init.py +0 -370
- hud/cli/rl/pod.py +0 -501
- hud/cli/rl/ssh.py +0 -322
- hud/cli/rl/train.py +0 -562
- hud/cli/rl/utils.py +0 -165
- hud/datasets/execution/__init__.py +0 -13
- hud/datasets/task.py +0 -116
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/WHEEL +0 -0
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/licenses/LICENSE +0 -0
hud/cli/eval.py
CHANGED
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
|
-
import json
|
|
7
6
|
import logging
|
|
8
7
|
from pathlib import Path
|
|
9
8
|
from typing import Any, Literal
|
|
@@ -11,23 +10,117 @@ from typing import Any, Literal
|
|
|
11
10
|
import typer
|
|
12
11
|
|
|
13
12
|
import hud
|
|
13
|
+
from hud.settings import settings
|
|
14
|
+
from hud.utils.group_eval import display_group_statistics, run_tasks_grouped
|
|
14
15
|
from hud.utils.hud_console import HUDConsole
|
|
15
16
|
|
|
16
17
|
logger = logging.getLogger(__name__)
|
|
17
18
|
hud_console = HUDConsole()
|
|
18
19
|
|
|
19
20
|
|
|
21
|
+
def get_available_models() -> list[dict[str, str | None]]:
|
|
22
|
+
"""Fetch available models from the HUD API (only ready models).
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
List of dicts with 'name', 'vllm_url', and 'base_model' keys
|
|
26
|
+
"""
|
|
27
|
+
try:
|
|
28
|
+
from hud.cli.rl import rl_api
|
|
29
|
+
|
|
30
|
+
hud_console.info("Fetching your models from https://app.hud.so/models")
|
|
31
|
+
models = rl_api.list_models()
|
|
32
|
+
|
|
33
|
+
# Filter for ready models only and sort by recency
|
|
34
|
+
ready_models = [m for m in models if m.status == "ready"]
|
|
35
|
+
ready_models.sort(key=lambda m: m.created_at or "", reverse=True)
|
|
36
|
+
|
|
37
|
+
# Count other statuses for informational purposes
|
|
38
|
+
training_count = sum(1 for m in models if m.status == "training")
|
|
39
|
+
# other_count = len(models) - len(ready_models) - training_count
|
|
40
|
+
|
|
41
|
+
if ready_models:
|
|
42
|
+
hud_console.success(f"Found {len(ready_models)} ready models:")
|
|
43
|
+
for model in ready_models:
|
|
44
|
+
vllm_status = " (vLLM deployed)" if model.vllm_url else ""
|
|
45
|
+
hud_console.info(f" ✅ {model.name}{vllm_status}")
|
|
46
|
+
|
|
47
|
+
if training_count > 0:
|
|
48
|
+
hud_console.info(f"\n({training_count} models currently training)")
|
|
49
|
+
|
|
50
|
+
return [
|
|
51
|
+
{"name": model.name, "vllm_url": model.vllm_url, "base_model": model.base_model}
|
|
52
|
+
for model in ready_models
|
|
53
|
+
]
|
|
54
|
+
else:
|
|
55
|
+
if training_count > 0:
|
|
56
|
+
hud_console.warning(
|
|
57
|
+
f"No ready models found. You have {training_count} models currently training."
|
|
58
|
+
)
|
|
59
|
+
else:
|
|
60
|
+
hud_console.warning("No models found in your account.")
|
|
61
|
+
return []
|
|
62
|
+
except Exception as e:
|
|
63
|
+
hud_console.debug(f"Error fetching models: {e}")
|
|
64
|
+
# Don't show the error to the user, just proceed without HUD models
|
|
65
|
+
return []
|
|
66
|
+
|
|
67
|
+
|
|
20
68
|
def build_agent(
|
|
21
|
-
agent_type: Literal["claude", "openai"],
|
|
69
|
+
agent_type: Literal["claude", "openai", "vllm"],
|
|
22
70
|
*,
|
|
23
71
|
model: str | None = None,
|
|
24
72
|
allowed_tools: list[str] | None = None,
|
|
25
73
|
verbose: bool = False,
|
|
74
|
+
vllm_base_url: str | None = None,
|
|
26
75
|
) -> Any:
|
|
27
76
|
"""Create and return the requested agent type."""
|
|
28
77
|
|
|
29
78
|
# Import agents lazily to avoid dependency issues
|
|
30
|
-
if agent_type == "
|
|
79
|
+
if agent_type == "vllm":
|
|
80
|
+
# Create a generic OpenAI agent for vLLM server
|
|
81
|
+
try:
|
|
82
|
+
from openai import AsyncOpenAI
|
|
83
|
+
|
|
84
|
+
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
|
|
85
|
+
except ImportError as e:
|
|
86
|
+
hud_console.error(
|
|
87
|
+
"OpenAI dependencies are not installed. "
|
|
88
|
+
"Please install with: pip install 'hud-python[agent]'"
|
|
89
|
+
)
|
|
90
|
+
raise typer.Exit(1) from e
|
|
91
|
+
|
|
92
|
+
# Determine the base URL to use
|
|
93
|
+
if vllm_base_url is not None:
|
|
94
|
+
# Use the provided vLLM URL (for custom/local servers)
|
|
95
|
+
base_url = vllm_base_url
|
|
96
|
+
hud_console.info(f"Using vLLM server at {base_url}")
|
|
97
|
+
api_key = (
|
|
98
|
+
settings.api_key if base_url.startswith(settings.hud_rl_url) else "token-abc123"
|
|
99
|
+
)
|
|
100
|
+
else:
|
|
101
|
+
# Default to localhost
|
|
102
|
+
base_url = "http://localhost:8000/v1"
|
|
103
|
+
api_key = "token-abc123"
|
|
104
|
+
|
|
105
|
+
# Create OpenAI client for vLLM
|
|
106
|
+
openai_client = AsyncOpenAI(
|
|
107
|
+
base_url=base_url,
|
|
108
|
+
api_key=api_key,
|
|
109
|
+
timeout=30.0,
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
return GenericOpenAIChatAgent(
|
|
113
|
+
openai_client=openai_client,
|
|
114
|
+
model_name=model or "served-model", # Default model name
|
|
115
|
+
verbose=verbose,
|
|
116
|
+
completion_kwargs={
|
|
117
|
+
"temperature": 0.7,
|
|
118
|
+
"max_tokens": 2048,
|
|
119
|
+
"tool_choice": "required", # if self.actor_config.force_tool_choice else "auto",
|
|
120
|
+
},
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
elif agent_type == "openai":
|
|
31
124
|
try:
|
|
32
125
|
from hud.agents import OperatorAgent
|
|
33
126
|
except ImportError as e:
|
|
@@ -73,17 +166,19 @@ def build_agent(
|
|
|
73
166
|
async def run_single_task(
|
|
74
167
|
source: str,
|
|
75
168
|
*,
|
|
76
|
-
agent_type: Literal["claude", "openai"] = "claude",
|
|
169
|
+
agent_type: Literal["claude", "openai", "vllm"] = "claude",
|
|
77
170
|
model: str | None = None,
|
|
78
171
|
allowed_tools: list[str] | None = None,
|
|
79
172
|
max_steps: int = 10,
|
|
80
173
|
verbose: bool = False,
|
|
174
|
+
vllm_base_url: str | None = None,
|
|
175
|
+
group_size: int = 1,
|
|
81
176
|
) -> None:
|
|
82
177
|
"""Load one task and execute it, or detect if JSON contains a list and run as dataset."""
|
|
83
178
|
|
|
84
179
|
# Import Task and run_dataset lazily
|
|
85
180
|
try:
|
|
86
|
-
from hud.
|
|
181
|
+
from hud.utils.tasks import load_tasks
|
|
87
182
|
except ImportError as e:
|
|
88
183
|
hud_console.error(
|
|
89
184
|
"Dataset dependencies are not installed. "
|
|
@@ -91,114 +186,113 @@ async def run_single_task(
|
|
|
91
186
|
)
|
|
92
187
|
raise typer.Exit(1) from e
|
|
93
188
|
|
|
94
|
-
# Check if it's a
|
|
189
|
+
# Check if it's a file
|
|
95
190
|
path = Path(source)
|
|
96
|
-
if path.exists() and path.suffix
|
|
191
|
+
if path.exists() and (path.suffix in [".json", ".jsonl"]):
|
|
97
192
|
hud_console.info("📊 Loading task file…")
|
|
98
|
-
with open(path) as f: # noqa: ASYNC230
|
|
99
|
-
json_data = json.load(f)
|
|
100
|
-
|
|
101
|
-
# Check if JSON contains multiple tasks (list with more than 1 task)
|
|
102
|
-
if isinstance(json_data, list) and len(json_data) > 1:
|
|
103
|
-
hud_console.info(f"Found {len(json_data)} tasks in JSON file, running as dataset…")
|
|
104
|
-
|
|
105
|
-
# Build agent class and config for run_dataset
|
|
106
|
-
if agent_type == "openai":
|
|
107
|
-
try:
|
|
108
|
-
from hud.agents import OperatorAgent
|
|
109
|
-
|
|
110
|
-
agent_class = OperatorAgent
|
|
111
|
-
except ImportError as e:
|
|
112
|
-
hud_console.error(
|
|
113
|
-
"OpenAI agent dependencies are not installed. "
|
|
114
|
-
"Please install with: pip install 'hud-python\u27e6agent\u27e7'"
|
|
115
|
-
)
|
|
116
|
-
raise typer.Exit(1) from e
|
|
117
|
-
|
|
118
|
-
agent_config: dict[str, Any] = {"verbose": verbose}
|
|
119
|
-
if allowed_tools:
|
|
120
|
-
agent_config["allowed_tools"] = allowed_tools
|
|
121
193
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
from hud.agents import ClaudeAgent
|
|
125
|
-
|
|
126
|
-
agent_class = ClaudeAgent
|
|
127
|
-
except ImportError as e:
|
|
128
|
-
hud_console.error(
|
|
129
|
-
"Claude agent dependencies are not installed. "
|
|
130
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
131
|
-
)
|
|
132
|
-
raise typer.Exit(1) from e
|
|
133
|
-
|
|
134
|
-
agent_config = {
|
|
135
|
-
"model": model or "claude-sonnet-4-20250514",
|
|
136
|
-
"verbose": verbose,
|
|
137
|
-
}
|
|
138
|
-
if allowed_tools:
|
|
139
|
-
agent_config["allowed_tools"] = allowed_tools
|
|
140
|
-
|
|
141
|
-
# Run as dataset with single-task concurrency to maintain debug behavior
|
|
142
|
-
results = await run_dataset(
|
|
143
|
-
name=f"JSON Dataset: {path.name}",
|
|
144
|
-
dataset=json_data, # Pass the list directly
|
|
145
|
-
agent_class=agent_class,
|
|
146
|
-
agent_config=agent_config,
|
|
147
|
-
max_concurrent=1, # Run sequentially for debug mode
|
|
148
|
-
metadata={"source": str(path)},
|
|
149
|
-
max_steps=max_steps,
|
|
150
|
-
)
|
|
194
|
+
# Use unified loader for both JSON and JSONL
|
|
195
|
+
tasks = load_tasks(str(path))
|
|
151
196
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
return
|
|
156
|
-
|
|
157
|
-
# Single task JSON (either direct object or list with 1 task)
|
|
158
|
-
if isinstance(json_data, list) and len(json_data) == 1:
|
|
159
|
-
hud_console.info("Found 1 task in JSON file, running as single task…")
|
|
160
|
-
task = Task(**json_data[0])
|
|
161
|
-
elif isinstance(json_data, dict):
|
|
162
|
-
task = Task(**json_data)
|
|
163
|
-
else:
|
|
164
|
-
hud_console.error("JSON file must contain a list of tasks when using --full flag")
|
|
165
|
-
raise typer.Exit(1)
|
|
197
|
+
# Single task - use the first (and only) task
|
|
198
|
+
task = tasks[0]
|
|
199
|
+
hud_console.info("Found 1 task, running as single task…")
|
|
166
200
|
else:
|
|
167
|
-
# Load from HuggingFace dataset
|
|
168
|
-
hud_console.info(f"📊 Loading
|
|
169
|
-
|
|
170
|
-
from datasets import load_dataset
|
|
171
|
-
except ImportError as e:
|
|
172
|
-
hud_console.error(
|
|
173
|
-
"Datasets library is not installed. "
|
|
174
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
175
|
-
)
|
|
176
|
-
raise typer.Exit(1) from e
|
|
201
|
+
# Load from HuggingFace dataset or non-file source
|
|
202
|
+
hud_console.info(f"📊 Loading tasks from: {source}…")
|
|
203
|
+
tasks = load_tasks(source)
|
|
177
204
|
|
|
178
|
-
|
|
205
|
+
if not tasks:
|
|
206
|
+
hud_console.error(f"No tasks found in: {source}")
|
|
207
|
+
raise typer.Exit(1)
|
|
179
208
|
|
|
180
|
-
#
|
|
181
|
-
|
|
182
|
-
|
|
209
|
+
# Single task - use the first task
|
|
210
|
+
task = tasks[0]
|
|
211
|
+
hud_console.info(
|
|
212
|
+
"Using first task from dataset (run with --full to run the entire dataset)..."
|
|
213
|
+
)
|
|
183
214
|
|
|
184
215
|
task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt
|
|
185
216
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
217
|
+
# Use grouped evaluation if group_size > 1
|
|
218
|
+
if group_size > 1:
|
|
219
|
+
hud_console.info(f"🔄 Running task with group_size={group_size}")
|
|
220
|
+
agent_config: dict[str, Any] = {}
|
|
221
|
+
|
|
222
|
+
# Build agent configuration
|
|
223
|
+
if agent_type == "vllm":
|
|
224
|
+
# Special handling for vLLM
|
|
225
|
+
sample_agent = build_agent(
|
|
226
|
+
agent_type,
|
|
227
|
+
model=model,
|
|
228
|
+
allowed_tools=allowed_tools,
|
|
229
|
+
verbose=verbose,
|
|
230
|
+
vllm_base_url=vllm_base_url,
|
|
231
|
+
)
|
|
232
|
+
agent_config = {
|
|
233
|
+
"openai_client": sample_agent.oai,
|
|
234
|
+
"model_name": sample_agent.model_name,
|
|
235
|
+
"verbose": verbose,
|
|
236
|
+
"completion_kwargs": sample_agent.completion_kwargs,
|
|
237
|
+
}
|
|
238
|
+
if allowed_tools:
|
|
239
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
240
|
+
|
|
241
|
+
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
|
|
242
|
+
|
|
243
|
+
agent_class = GenericOpenAIChatAgent
|
|
244
|
+
elif agent_type == "openai":
|
|
245
|
+
from hud.agents import OperatorAgent
|
|
246
|
+
|
|
247
|
+
agent_class = OperatorAgent
|
|
248
|
+
agent_config = {"verbose": verbose}
|
|
249
|
+
if allowed_tools:
|
|
250
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
251
|
+
else:
|
|
252
|
+
from hud.agents import ClaudeAgent
|
|
253
|
+
|
|
254
|
+
agent_class = ClaudeAgent
|
|
255
|
+
agent_config = {
|
|
256
|
+
"model": model or "claude-sonnet-4-20250514",
|
|
257
|
+
"verbose": verbose,
|
|
258
|
+
}
|
|
259
|
+
if allowed_tools:
|
|
260
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
261
|
+
|
|
262
|
+
# Run with grouping
|
|
263
|
+
with hud.trace(name=f"{task_prompt} (group_size={group_size})"):
|
|
264
|
+
stats = await run_tasks_grouped(
|
|
265
|
+
tasks=[task],
|
|
266
|
+
agent_class=agent_class,
|
|
267
|
+
agent_config=agent_config,
|
|
268
|
+
group_size=group_size,
|
|
269
|
+
max_parallel_episodes=48, # Same as RL default
|
|
270
|
+
max_steps=max_steps,
|
|
271
|
+
verbose=verbose,
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
# Display results
|
|
275
|
+
display_group_statistics(stats, show_details=True)
|
|
276
|
+
|
|
277
|
+
else:
|
|
278
|
+
# Original single-run logic
|
|
279
|
+
with hud.trace(name=task_prompt):
|
|
280
|
+
agent = build_agent(
|
|
281
|
+
agent_type,
|
|
282
|
+
model=model,
|
|
283
|
+
allowed_tools=allowed_tools,
|
|
284
|
+
verbose=verbose,
|
|
285
|
+
vllm_base_url=vllm_base_url,
|
|
286
|
+
)
|
|
287
|
+
hud_console.info(task.prompt)
|
|
288
|
+
result = await agent.run(task, max_steps=max_steps)
|
|
289
|
+
hud_console.success(f"Reward: {result.reward}")
|
|
196
290
|
|
|
197
291
|
|
|
198
292
|
async def run_full_dataset(
|
|
199
293
|
source: str,
|
|
200
294
|
*,
|
|
201
|
-
agent_type: Literal["claude", "openai"] = "claude",
|
|
295
|
+
agent_type: Literal["claude", "openai", "vllm"] = "claude",
|
|
202
296
|
model: str | None = None,
|
|
203
297
|
allowed_tools: list[str] | None = None,
|
|
204
298
|
max_concurrent: int = 50,
|
|
@@ -207,6 +301,8 @@ async def run_full_dataset(
|
|
|
207
301
|
max_workers: int | None = None,
|
|
208
302
|
max_concurrent_per_worker: int = 25,
|
|
209
303
|
verbose: bool = False,
|
|
304
|
+
vllm_base_url: str | None = None,
|
|
305
|
+
group_size: int = 1,
|
|
210
306
|
) -> list[Any]:
|
|
211
307
|
"""Run evaluation across the entire dataset.
|
|
212
308
|
|
|
@@ -216,32 +312,64 @@ async def run_full_dataset(
|
|
|
216
312
|
# Import run_dataset lazily
|
|
217
313
|
try:
|
|
218
314
|
from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual
|
|
315
|
+
from hud.utils.tasks import load_tasks
|
|
219
316
|
except ImportError as e:
|
|
220
317
|
hud_console.error(
|
|
221
318
|
"Dataset dependencies are not installed. "
|
|
222
|
-
"Please install with: pip install 'hud-python[
|
|
319
|
+
"Please install with: pip install 'hud-python[agent]'"
|
|
223
320
|
)
|
|
224
321
|
raise typer.Exit(1) from e
|
|
225
322
|
|
|
226
|
-
#
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
dataset_name = source.split("/")[-1]
|
|
323
|
+
# Load tasks using unified loader
|
|
324
|
+
hud_console.info(f"📊 Loading tasks from: {source}…")
|
|
325
|
+
tasks = load_tasks(source)
|
|
230
326
|
|
|
231
|
-
if
|
|
232
|
-
|
|
233
|
-
|
|
327
|
+
if not tasks:
|
|
328
|
+
hud_console.error(f"No tasks found in: {source}")
|
|
329
|
+
raise typer.Exit(1)
|
|
234
330
|
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
331
|
+
# Convert Task objects to dicts for dataset runners
|
|
332
|
+
dataset_or_tasks = [task.model_dump() for task in tasks]
|
|
333
|
+
|
|
334
|
+
# Determine dataset name
|
|
335
|
+
path = Path(source)
|
|
336
|
+
dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1]
|
|
337
|
+
|
|
338
|
+
hud_console.info(f"Found {len(tasks)} tasks")
|
|
242
339
|
|
|
243
340
|
# Build agent class + config for run_dataset
|
|
244
|
-
if agent_type == "
|
|
341
|
+
if agent_type == "vllm":
|
|
342
|
+
try:
|
|
343
|
+
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
|
|
344
|
+
|
|
345
|
+
agent_class = GenericOpenAIChatAgent
|
|
346
|
+
except ImportError as e:
|
|
347
|
+
hud_console.error(
|
|
348
|
+
"OpenAI dependencies are not installed. "
|
|
349
|
+
"Please install with: pip install 'hud-python[agent]'"
|
|
350
|
+
)
|
|
351
|
+
raise typer.Exit(1) from e
|
|
352
|
+
|
|
353
|
+
# Use build_agent to create a sample agent to get the config
|
|
354
|
+
sample_agent = build_agent(
|
|
355
|
+
agent_type,
|
|
356
|
+
model=model,
|
|
357
|
+
allowed_tools=allowed_tools,
|
|
358
|
+
verbose=verbose,
|
|
359
|
+
vllm_base_url=vllm_base_url,
|
|
360
|
+
)
|
|
361
|
+
|
|
362
|
+
# Extract the config from the sample agent
|
|
363
|
+
agent_config: dict[str, Any] = {
|
|
364
|
+
"openai_client": sample_agent.oai,
|
|
365
|
+
"model_name": sample_agent.model_name,
|
|
366
|
+
"verbose": verbose,
|
|
367
|
+
"completion_kwargs": sample_agent.completion_kwargs,
|
|
368
|
+
}
|
|
369
|
+
if allowed_tools:
|
|
370
|
+
agent_config["allowed_tools"] = allowed_tools
|
|
371
|
+
|
|
372
|
+
elif agent_type == "openai":
|
|
245
373
|
try:
|
|
246
374
|
from hud.agents import OperatorAgent
|
|
247
375
|
|
|
@@ -253,7 +381,7 @@ async def run_full_dataset(
|
|
|
253
381
|
)
|
|
254
382
|
raise typer.Exit(1) from e
|
|
255
383
|
|
|
256
|
-
agent_config
|
|
384
|
+
agent_config = {"verbose": verbose}
|
|
257
385
|
if allowed_tools:
|
|
258
386
|
agent_config["allowed_tools"] = allowed_tools
|
|
259
387
|
|
|
@@ -276,7 +404,51 @@ async def run_full_dataset(
|
|
|
276
404
|
if allowed_tools:
|
|
277
405
|
agent_config["allowed_tools"] = allowed_tools
|
|
278
406
|
|
|
279
|
-
if
|
|
407
|
+
# Use grouped evaluation if group_size > 1
|
|
408
|
+
if group_size > 1:
|
|
409
|
+
hud_console.info(f"🔄 Running dataset with group_size={group_size}")
|
|
410
|
+
|
|
411
|
+
# Run with job tracking
|
|
412
|
+
with hud.job(
|
|
413
|
+
name=f"Evaluation {dataset_name} (group_size={group_size})",
|
|
414
|
+
metadata={
|
|
415
|
+
"dataset": source,
|
|
416
|
+
"group_size": group_size,
|
|
417
|
+
"tasks": len(dataset_or_tasks),
|
|
418
|
+
"total_episodes": len(dataset_or_tasks) * group_size,
|
|
419
|
+
},
|
|
420
|
+
) as job:
|
|
421
|
+
# Convert dicts to Task objects if needed
|
|
422
|
+
from hud.datasets import Task
|
|
423
|
+
|
|
424
|
+
tasks = []
|
|
425
|
+
for item in dataset_or_tasks:
|
|
426
|
+
if isinstance(item, dict):
|
|
427
|
+
tasks.append(Task(**item))
|
|
428
|
+
else:
|
|
429
|
+
tasks.append(item)
|
|
430
|
+
|
|
431
|
+
stats = await run_tasks_grouped(
|
|
432
|
+
tasks=tasks,
|
|
433
|
+
agent_class=agent_class,
|
|
434
|
+
agent_config=agent_config,
|
|
435
|
+
group_size=group_size,
|
|
436
|
+
max_parallel_episodes=max_concurrent
|
|
437
|
+
if not parallel
|
|
438
|
+
else max_concurrent_per_worker * (max_workers or 4),
|
|
439
|
+
max_steps=max_steps,
|
|
440
|
+
verbose=verbose,
|
|
441
|
+
job_id=job.id,
|
|
442
|
+
)
|
|
443
|
+
|
|
444
|
+
# Display results
|
|
445
|
+
display_group_statistics(stats, show_details=len(stats) <= 20)
|
|
446
|
+
|
|
447
|
+
# Return stats for consistency with other modes
|
|
448
|
+
return stats
|
|
449
|
+
|
|
450
|
+
# Original logic for non-grouped evaluation
|
|
451
|
+
elif parallel:
|
|
280
452
|
hud_console.info(
|
|
281
453
|
f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…" # noqa: E501
|
|
282
454
|
)
|
|
@@ -322,17 +494,17 @@ async def run_full_dataset(
|
|
|
322
494
|
def eval_command(
|
|
323
495
|
source: str = typer.Argument(
|
|
324
496
|
...,
|
|
325
|
-
help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50'),
|
|
497
|
+
help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50'), JSON file (array of tasks), or JSONL file (one task per line)", # noqa: E501
|
|
326
498
|
),
|
|
327
499
|
full: bool = typer.Option(
|
|
328
500
|
False,
|
|
329
501
|
"--full",
|
|
330
502
|
help="Run the entire dataset (omit for single-task debug mode)",
|
|
331
503
|
),
|
|
332
|
-
agent: Literal["claude", "openai"] = typer.Option(
|
|
504
|
+
agent: Literal["claude", "openai", "vllm"] = typer.Option(
|
|
333
505
|
"claude",
|
|
334
506
|
"--agent",
|
|
335
|
-
help="Agent backend to use",
|
|
507
|
+
help="Agent backend to use (claude, openai, or vllm for local server)",
|
|
336
508
|
),
|
|
337
509
|
model: str | None = typer.Option(
|
|
338
510
|
None,
|
|
@@ -374,6 +546,16 @@ def eval_command(
|
|
|
374
546
|
"--verbose",
|
|
375
547
|
help="Enable verbose output from the agent",
|
|
376
548
|
),
|
|
549
|
+
vllm_base_url: str | None = typer.Option(
|
|
550
|
+
None,
|
|
551
|
+
"--vllm-base-url",
|
|
552
|
+
help="Base URL for vLLM server (when using --agent vllm)",
|
|
553
|
+
),
|
|
554
|
+
group_size: int = typer.Option(
|
|
555
|
+
1,
|
|
556
|
+
"--group-size",
|
|
557
|
+
help="Number of times to run each task (similar to RL training)",
|
|
558
|
+
),
|
|
377
559
|
) -> None:
|
|
378
560
|
"""🚀 Run evaluation on datasets or individual tasks with agents.
|
|
379
561
|
|
|
@@ -402,6 +584,12 @@ def eval_command(
|
|
|
402
584
|
# Run with OpenAI Operator agent
|
|
403
585
|
hud eval hud-evals/OSWorld-Gold-Beta --agent openai
|
|
404
586
|
|
|
587
|
+
# Use local vLLM server (default: localhost:8000)
|
|
588
|
+
hud eval task.json --agent vllm --model Qwen/Qwen2.5-VL-3B-Instruct
|
|
589
|
+
|
|
590
|
+
# Use custom vLLM server URL
|
|
591
|
+
hud eval task.json --agent vllm --vllm-base-url http://192.168.1.100:8000/v1
|
|
592
|
+
|
|
405
593
|
# Run with verbose output for debugging
|
|
406
594
|
hud eval task.json --verbose
|
|
407
595
|
"""
|
|
@@ -419,6 +607,12 @@ def eval_command(
|
|
|
419
607
|
hud_console.error("OPENAI_API_KEY is required for OpenAI agent")
|
|
420
608
|
hud_console.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
|
|
421
609
|
raise typer.Exit(1)
|
|
610
|
+
elif agent == "vllm":
|
|
611
|
+
if model:
|
|
612
|
+
hud_console.info(f"Using vLLM with model: {model}")
|
|
613
|
+
else:
|
|
614
|
+
hud_console.error("Model name is required for vLLM agent, specify with --model")
|
|
615
|
+
raise typer.Exit(1)
|
|
422
616
|
|
|
423
617
|
# Check for HUD_API_KEY if using HUD services
|
|
424
618
|
if not settings.api_key:
|
|
@@ -448,6 +642,8 @@ def eval_command(
|
|
|
448
642
|
max_workers=max_workers,
|
|
449
643
|
max_concurrent_per_worker=max_concurrent_per_worker,
|
|
450
644
|
verbose=verbose,
|
|
645
|
+
vllm_base_url=vllm_base_url,
|
|
646
|
+
group_size=group_size,
|
|
451
647
|
)
|
|
452
648
|
)
|
|
453
649
|
else:
|
|
@@ -459,5 +655,7 @@ def eval_command(
|
|
|
459
655
|
allowed_tools=allowed_tools_list,
|
|
460
656
|
max_steps=max_steps,
|
|
461
657
|
verbose=verbose,
|
|
658
|
+
vllm_base_url=vllm_base_url,
|
|
659
|
+
group_size=group_size,
|
|
462
660
|
)
|
|
463
661
|
)
|
|
File without changes
|
hud/cli/flows/tasks.py
ADDED
|
File without changes
|