hud-python 0.4.27__py3-none-any.whl → 0.4.29__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (76)
  1. hud/__init__.py +2 -1
  2. hud/agents/base.py +73 -45
  3. hud/agents/claude.py +8 -4
  4. hud/agents/openai_chat_generic.py +65 -40
  5. hud/agents/tests/test_base.py +0 -4
  6. hud/agents/tests/test_openai.py +1 -1
  7. hud/cli/__init__.py +182 -52
  8. hud/cli/dev.py +8 -9
  9. hud/cli/eval.py +317 -119
  10. hud/cli/flows/__init__.py +0 -0
  11. hud/cli/flows/tasks.py +0 -0
  12. hud/cli/get.py +160 -0
  13. hud/cli/rl/__init__.py +563 -71
  14. hud/cli/rl/config.py +94 -0
  15. hud/cli/rl/display.py +133 -0
  16. hud/cli/rl/gpu.py +63 -0
  17. hud/cli/rl/gpu_utils.py +318 -0
  18. hud/cli/rl/presets.py +96 -0
  19. hud/cli/rl/remote_runner.py +348 -0
  20. hud/cli/rl/rl_api.py +150 -0
  21. hud/cli/rl/vllm.py +177 -0
  22. hud/cli/tests/test_analyze_metadata.py +0 -1
  23. hud/cli/utils/tasks.py +26 -0
  24. hud/clients/base.py +21 -23
  25. hud/clients/mcp_use.py +36 -44
  26. hud/clients/tests/test_mcp_use_retry.py +10 -10
  27. hud/datasets/__init__.py +4 -3
  28. hud/datasets/{execution/parallel.py → parallel.py} +1 -1
  29. hud/datasets/{execution/runner.py → runner.py} +1 -1
  30. hud/datasets/utils.py +1 -1
  31. hud/native/tests/test_native_init.py +1 -1
  32. hud/otel/config.py +1 -1
  33. hud/otel/instrumentation.py +35 -0
  34. hud/rl/README.md +31 -0
  35. hud/rl/__init__.py +1 -0
  36. hud/rl/actor.py +174 -0
  37. hud/rl/buffer.py +371 -0
  38. hud/rl/chat_template.jinja +101 -0
  39. hud/rl/config.py +184 -0
  40. hud/rl/distributed.py +95 -0
  41. hud/rl/learner.py +586 -0
  42. hud/rl/tests/__init__.py +1 -0
  43. hud/rl/tests/test_learner.py +171 -0
  44. hud/rl/train.py +354 -0
  45. hud/rl/types.py +101 -0
  46. hud/rl/utils/start_vllm_server.sh +30 -0
  47. hud/rl/utils.py +524 -0
  48. hud/rl/vllm_adapter.py +125 -0
  49. hud/settings.py +6 -0
  50. hud/telemetry/__init__.py +2 -1
  51. hud/telemetry/job.py +46 -3
  52. hud/telemetry/tests/test_trace.py +3 -3
  53. hud/telemetry/trace.py +85 -13
  54. hud/tools/computer/hud.py +4 -4
  55. hud/tools/tests/test_computer.py +3 -3
  56. hud/tools/tests/test_computer_actions.py +1 -1
  57. hud/types.py +123 -2
  58. hud/utils/group_eval.py +223 -0
  59. hud/utils/hud_console.py +113 -13
  60. hud/utils/tasks.py +119 -0
  61. hud/utils/tests/test_version.py +1 -1
  62. hud/version.py +1 -1
  63. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/METADATA +20 -2
  64. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/RECORD +67 -47
  65. hud/cli/hf.py +0 -406
  66. hud/cli/rl/README.md +0 -243
  67. hud/cli/rl/init.py +0 -370
  68. hud/cli/rl/pod.py +0 -501
  69. hud/cli/rl/ssh.py +0 -322
  70. hud/cli/rl/train.py +0 -562
  71. hud/cli/rl/utils.py +0 -165
  72. hud/datasets/execution/__init__.py +0 -13
  73. hud/datasets/task.py +0 -116
  74. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/WHEEL +0 -0
  75. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/entry_points.txt +0 -0
  76. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/licenses/LICENSE +0 -0
hud/cli/eval.py CHANGED
@@ -3,7 +3,6 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import asyncio
6
- import json
7
6
  import logging
8
7
  from pathlib import Path
9
8
  from typing import Any, Literal
@@ -11,23 +10,117 @@ from typing import Any, Literal
11
10
  import typer
12
11
 
13
12
  import hud
13
+ from hud.settings import settings
14
+ from hud.utils.group_eval import display_group_statistics, run_tasks_grouped
14
15
  from hud.utils.hud_console import HUDConsole
15
16
 
16
17
  logger = logging.getLogger(__name__)
17
18
  hud_console = HUDConsole()
18
19
 
19
20
 
21
+ def get_available_models() -> list[dict[str, str | None]]:
22
+ """Fetch available models from the HUD API (only ready models).
23
+
24
+ Returns:
25
+ List of dicts with 'name', 'vllm_url', and 'base_model' keys
26
+ """
27
+ try:
28
+ from hud.cli.rl import rl_api
29
+
30
+ hud_console.info("Fetching your models from https://app.hud.so/models")
31
+ models = rl_api.list_models()
32
+
33
+ # Filter for ready models only and sort by recency
34
+ ready_models = [m for m in models if m.status == "ready"]
35
+ ready_models.sort(key=lambda m: m.created_at or "", reverse=True)
36
+
37
+ # Count other statuses for informational purposes
38
+ training_count = sum(1 for m in models if m.status == "training")
39
+ # other_count = len(models) - len(ready_models) - training_count
40
+
41
+ if ready_models:
42
+ hud_console.success(f"Found {len(ready_models)} ready models:")
43
+ for model in ready_models:
44
+ vllm_status = " (vLLM deployed)" if model.vllm_url else ""
45
+ hud_console.info(f" ✅ {model.name}{vllm_status}")
46
+
47
+ if training_count > 0:
48
+ hud_console.info(f"\n({training_count} models currently training)")
49
+
50
+ return [
51
+ {"name": model.name, "vllm_url": model.vllm_url, "base_model": model.base_model}
52
+ for model in ready_models
53
+ ]
54
+ else:
55
+ if training_count > 0:
56
+ hud_console.warning(
57
+ f"No ready models found. You have {training_count} models currently training."
58
+ )
59
+ else:
60
+ hud_console.warning("No models found in your account.")
61
+ return []
62
+ except Exception as e:
63
+ hud_console.debug(f"Error fetching models: {e}")
64
+ # Don't show the error to the user, just proceed without HUD models
65
+ return []
66
+
67
+
20
68
  def build_agent(
21
- agent_type: Literal["claude", "openai"],
69
+ agent_type: Literal["claude", "openai", "vllm"],
22
70
  *,
23
71
  model: str | None = None,
24
72
  allowed_tools: list[str] | None = None,
25
73
  verbose: bool = False,
74
+ vllm_base_url: str | None = None,
26
75
  ) -> Any:
27
76
  """Create and return the requested agent type."""
28
77
 
29
78
  # Import agents lazily to avoid dependency issues
30
- if agent_type == "openai":
79
+ if agent_type == "vllm":
80
+ # Create a generic OpenAI agent for vLLM server
81
+ try:
82
+ from openai import AsyncOpenAI
83
+
84
+ from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
85
+ except ImportError as e:
86
+ hud_console.error(
87
+ "OpenAI dependencies are not installed. "
88
+ "Please install with: pip install 'hud-python[agent]'"
89
+ )
90
+ raise typer.Exit(1) from e
91
+
92
+ # Determine the base URL to use
93
+ if vllm_base_url is not None:
94
+ # Use the provided vLLM URL (for custom/local servers)
95
+ base_url = vllm_base_url
96
+ hud_console.info(f"Using vLLM server at {base_url}")
97
+ api_key = (
98
+ settings.api_key if base_url.startswith(settings.hud_rl_url) else "token-abc123"
99
+ )
100
+ else:
101
+ # Default to localhost
102
+ base_url = "http://localhost:8000/v1"
103
+ api_key = "token-abc123"
104
+
105
+ # Create OpenAI client for vLLM
106
+ openai_client = AsyncOpenAI(
107
+ base_url=base_url,
108
+ api_key=api_key,
109
+ timeout=30.0,
110
+ )
111
+
112
+ return GenericOpenAIChatAgent(
113
+ openai_client=openai_client,
114
+ model_name=model or "served-model", # Default model name
115
+ verbose=verbose,
116
+ completion_kwargs={
117
+ "temperature": 0.7,
118
+ "max_tokens": 2048,
119
+ "tool_choice": "required", # if self.actor_config.force_tool_choice else "auto",
120
+ },
121
+ )
122
+
123
+ elif agent_type == "openai":
31
124
  try:
32
125
  from hud.agents import OperatorAgent
33
126
  except ImportError as e:
@@ -73,17 +166,19 @@ def build_agent(
73
166
  async def run_single_task(
74
167
  source: str,
75
168
  *,
76
- agent_type: Literal["claude", "openai"] = "claude",
169
+ agent_type: Literal["claude", "openai", "vllm"] = "claude",
77
170
  model: str | None = None,
78
171
  allowed_tools: list[str] | None = None,
79
172
  max_steps: int = 10,
80
173
  verbose: bool = False,
174
+ vllm_base_url: str | None = None,
175
+ group_size: int = 1,
81
176
  ) -> None:
82
177
  """Load one task and execute it, or detect if JSON contains a list and run as dataset."""
83
178
 
84
179
  # Import Task and run_dataset lazily
85
180
  try:
86
- from hud.datasets import Task, run_dataset
181
+ from hud.utils.tasks import load_tasks
87
182
  except ImportError as e:
88
183
  hud_console.error(
89
184
  "Dataset dependencies are not installed. "
@@ -91,114 +186,113 @@ async def run_single_task(
91
186
  )
92
187
  raise typer.Exit(1) from e
93
188
 
94
- # Check if it's a JSON file
189
+ # Check if it's a file
95
190
  path = Path(source)
96
- if path.exists() and path.suffix == ".json":
191
+ if path.exists() and (path.suffix in [".json", ".jsonl"]):
97
192
  hud_console.info("📊 Loading task file…")
98
- with open(path) as f: # noqa: ASYNC230
99
- json_data = json.load(f)
100
-
101
- # Check if JSON contains multiple tasks (list with more than 1 task)
102
- if isinstance(json_data, list) and len(json_data) > 1:
103
- hud_console.info(f"Found {len(json_data)} tasks in JSON file, running as dataset…")
104
-
105
- # Build agent class and config for run_dataset
106
- if agent_type == "openai":
107
- try:
108
- from hud.agents import OperatorAgent
109
-
110
- agent_class = OperatorAgent
111
- except ImportError as e:
112
- hud_console.error(
113
- "OpenAI agent dependencies are not installed. "
114
- "Please install with: pip install 'hud-python\u27e6agent\u27e7'"
115
- )
116
- raise typer.Exit(1) from e
117
-
118
- agent_config: dict[str, Any] = {"verbose": verbose}
119
- if allowed_tools:
120
- agent_config["allowed_tools"] = allowed_tools
121
193
 
122
- else:
123
- try:
124
- from hud.agents import ClaudeAgent
125
-
126
- agent_class = ClaudeAgent
127
- except ImportError as e:
128
- hud_console.error(
129
- "Claude agent dependencies are not installed. "
130
- "Please install with: pip install 'hud-python[agent]'"
131
- )
132
- raise typer.Exit(1) from e
133
-
134
- agent_config = {
135
- "model": model or "claude-sonnet-4-20250514",
136
- "verbose": verbose,
137
- }
138
- if allowed_tools:
139
- agent_config["allowed_tools"] = allowed_tools
140
-
141
- # Run as dataset with single-task concurrency to maintain debug behavior
142
- results = await run_dataset(
143
- name=f"JSON Dataset: {path.name}",
144
- dataset=json_data, # Pass the list directly
145
- agent_class=agent_class,
146
- agent_config=agent_config,
147
- max_concurrent=1, # Run sequentially for debug mode
148
- metadata={"source": str(path)},
149
- max_steps=max_steps,
150
- )
194
+ # Use unified loader for both JSON and JSONL
195
+ tasks = load_tasks(str(path))
151
196
 
152
- # Display summary
153
- successful = sum(1 for r in results if getattr(r, "reward", 0) > 0)
154
- hud_console.success(f"Completed {len(results)} tasks: {successful} successful")
155
- return
156
-
157
- # Single task JSON (either direct object or list with 1 task)
158
- if isinstance(json_data, list) and len(json_data) == 1:
159
- hud_console.info("Found 1 task in JSON file, running as single task…")
160
- task = Task(**json_data[0])
161
- elif isinstance(json_data, dict):
162
- task = Task(**json_data)
163
- else:
164
- hud_console.error("JSON file must contain a list of tasks when using --full flag")
165
- raise typer.Exit(1)
197
+ # Single task - use the first (and only) task
198
+ task = tasks[0]
199
+ hud_console.info("Found 1 task, running as single task…")
166
200
  else:
167
- # Load from HuggingFace dataset
168
- hud_console.info(f"📊 Loading dataset from HuggingFace: {source}…")
169
- try:
170
- from datasets import load_dataset
171
- except ImportError as e:
172
- hud_console.error(
173
- "Datasets library is not installed. "
174
- "Please install with: pip install 'hud-python[agent]'"
175
- )
176
- raise typer.Exit(1) from e
201
+ # Load from HuggingFace dataset or non-file source
202
+ hud_console.info(f"📊 Loading tasks from: {source}…")
203
+ tasks = load_tasks(source)
177
204
 
178
- dataset = load_dataset(source, split="train")
205
+ if not tasks:
206
+ hud_console.error(f"No tasks found in: {source}")
207
+ raise typer.Exit(1)
179
208
 
180
- # Get first task from dataset
181
- sample_task = dataset[0] # type: ignore[index]
182
- task = Task(**sample_task) # type: ignore[arg-type]
209
+ # Single task - use the first task
210
+ task = tasks[0]
211
+ hud_console.info(
212
+ "Using first task from dataset (run with --full to run the entire dataset)..."
213
+ )
183
214
 
184
215
  task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt
185
216
 
186
- with hud.trace(name=task_prompt):
187
- agent = build_agent(
188
- agent_type,
189
- model=model,
190
- allowed_tools=allowed_tools,
191
- verbose=verbose,
192
- )
193
- hud_console.info(task.prompt)
194
- result = await agent.run(task, max_steps=max_steps)
195
- hud_console.success(f"Reward: {result.reward}")
217
+ # Use grouped evaluation if group_size > 1
218
+ if group_size > 1:
219
+ hud_console.info(f"🔄 Running task with group_size={group_size}")
220
+ agent_config: dict[str, Any] = {}
221
+
222
+ # Build agent configuration
223
+ if agent_type == "vllm":
224
+ # Special handling for vLLM
225
+ sample_agent = build_agent(
226
+ agent_type,
227
+ model=model,
228
+ allowed_tools=allowed_tools,
229
+ verbose=verbose,
230
+ vllm_base_url=vllm_base_url,
231
+ )
232
+ agent_config = {
233
+ "openai_client": sample_agent.oai,
234
+ "model_name": sample_agent.model_name,
235
+ "verbose": verbose,
236
+ "completion_kwargs": sample_agent.completion_kwargs,
237
+ }
238
+ if allowed_tools:
239
+ agent_config["allowed_tools"] = allowed_tools
240
+
241
+ from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
242
+
243
+ agent_class = GenericOpenAIChatAgent
244
+ elif agent_type == "openai":
245
+ from hud.agents import OperatorAgent
246
+
247
+ agent_class = OperatorAgent
248
+ agent_config = {"verbose": verbose}
249
+ if allowed_tools:
250
+ agent_config["allowed_tools"] = allowed_tools
251
+ else:
252
+ from hud.agents import ClaudeAgent
253
+
254
+ agent_class = ClaudeAgent
255
+ agent_config = {
256
+ "model": model or "claude-sonnet-4-20250514",
257
+ "verbose": verbose,
258
+ }
259
+ if allowed_tools:
260
+ agent_config["allowed_tools"] = allowed_tools
261
+
262
+ # Run with grouping
263
+ with hud.trace(name=f"{task_prompt} (group_size={group_size})"):
264
+ stats = await run_tasks_grouped(
265
+ tasks=[task],
266
+ agent_class=agent_class,
267
+ agent_config=agent_config,
268
+ group_size=group_size,
269
+ max_parallel_episodes=48, # Same as RL default
270
+ max_steps=max_steps,
271
+ verbose=verbose,
272
+ )
273
+
274
+ # Display results
275
+ display_group_statistics(stats, show_details=True)
276
+
277
+ else:
278
+ # Original single-run logic
279
+ with hud.trace(name=task_prompt):
280
+ agent = build_agent(
281
+ agent_type,
282
+ model=model,
283
+ allowed_tools=allowed_tools,
284
+ verbose=verbose,
285
+ vllm_base_url=vllm_base_url,
286
+ )
287
+ hud_console.info(task.prompt)
288
+ result = await agent.run(task, max_steps=max_steps)
289
+ hud_console.success(f"Reward: {result.reward}")
196
290
 
197
291
 
198
292
  async def run_full_dataset(
199
293
  source: str,
200
294
  *,
201
- agent_type: Literal["claude", "openai"] = "claude",
295
+ agent_type: Literal["claude", "openai", "vllm"] = "claude",
202
296
  model: str | None = None,
203
297
  allowed_tools: list[str] | None = None,
204
298
  max_concurrent: int = 50,
@@ -207,6 +301,8 @@ async def run_full_dataset(
207
301
  max_workers: int | None = None,
208
302
  max_concurrent_per_worker: int = 25,
209
303
  verbose: bool = False,
304
+ vllm_base_url: str | None = None,
305
+ group_size: int = 1,
210
306
  ) -> list[Any]:
211
307
  """Run evaluation across the entire dataset.
212
308
 
@@ -216,32 +312,64 @@ async def run_full_dataset(
216
312
  # Import run_dataset lazily
217
313
  try:
218
314
  from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual
315
+ from hud.utils.tasks import load_tasks
219
316
  except ImportError as e:
220
317
  hud_console.error(
221
318
  "Dataset dependencies are not installed. "
222
- "Please install with: pip install 'hud-python[[agent]]'"
319
+ "Please install with: pip install 'hud-python[agent]'"
223
320
  )
224
321
  raise typer.Exit(1) from e
225
322
 
226
- # Check if source is a JSON file with list of tasks
227
- path = Path(source)
228
- dataset_or_tasks = source
229
- dataset_name = source.split("/")[-1]
323
+ # Load tasks using unified loader
324
+ hud_console.info(f"📊 Loading tasks from: {source}…")
325
+ tasks = load_tasks(source)
230
326
 
231
- if path.exists() and path.suffix == ".json":
232
- with open(path) as f: # noqa: ASYNC230
233
- json_data = json.load(f)
327
+ if not tasks:
328
+ hud_console.error(f"No tasks found in: {source}")
329
+ raise typer.Exit(1)
234
330
 
235
- if isinstance(json_data, list):
236
- dataset_or_tasks = json_data
237
- dataset_name = f"JSON Dataset: {path.name}"
238
- hud_console.info(f"Found {len(json_data)} tasks in JSON file")
239
- else:
240
- hud_console.error("JSON file must contain a list of tasks when using --full flag")
241
- raise typer.Exit(1)
331
+ # Convert Task objects to dicts for dataset runners
332
+ dataset_or_tasks = [task.model_dump() for task in tasks]
333
+
334
+ # Determine dataset name
335
+ path = Path(source)
336
+ dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1]
337
+
338
+ hud_console.info(f"Found {len(tasks)} tasks")
242
339
 
243
340
  # Build agent class + config for run_dataset
244
- if agent_type == "openai":
341
+ if agent_type == "vllm":
342
+ try:
343
+ from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
344
+
345
+ agent_class = GenericOpenAIChatAgent
346
+ except ImportError as e:
347
+ hud_console.error(
348
+ "OpenAI dependencies are not installed. "
349
+ "Please install with: pip install 'hud-python[agent]'"
350
+ )
351
+ raise typer.Exit(1) from e
352
+
353
+ # Use build_agent to create a sample agent to get the config
354
+ sample_agent = build_agent(
355
+ agent_type,
356
+ model=model,
357
+ allowed_tools=allowed_tools,
358
+ verbose=verbose,
359
+ vllm_base_url=vllm_base_url,
360
+ )
361
+
362
+ # Extract the config from the sample agent
363
+ agent_config: dict[str, Any] = {
364
+ "openai_client": sample_agent.oai,
365
+ "model_name": sample_agent.model_name,
366
+ "verbose": verbose,
367
+ "completion_kwargs": sample_agent.completion_kwargs,
368
+ }
369
+ if allowed_tools:
370
+ agent_config["allowed_tools"] = allowed_tools
371
+
372
+ elif agent_type == "openai":
245
373
  try:
246
374
  from hud.agents import OperatorAgent
247
375
 
@@ -253,7 +381,7 @@ async def run_full_dataset(
253
381
  )
254
382
  raise typer.Exit(1) from e
255
383
 
256
- agent_config: dict[str, Any] = {"verbose": verbose}
384
+ agent_config = {"verbose": verbose}
257
385
  if allowed_tools:
258
386
  agent_config["allowed_tools"] = allowed_tools
259
387
 
@@ -276,7 +404,51 @@ async def run_full_dataset(
276
404
  if allowed_tools:
277
405
  agent_config["allowed_tools"] = allowed_tools
278
406
 
279
- if parallel:
407
+ # Use grouped evaluation if group_size > 1
408
+ if group_size > 1:
409
+ hud_console.info(f"🔄 Running dataset with group_size={group_size}")
410
+
411
+ # Run with job tracking
412
+ with hud.job(
413
+ name=f"Evaluation {dataset_name} (group_size={group_size})",
414
+ metadata={
415
+ "dataset": source,
416
+ "group_size": group_size,
417
+ "tasks": len(dataset_or_tasks),
418
+ "total_episodes": len(dataset_or_tasks) * group_size,
419
+ },
420
+ ) as job:
421
+ # Convert dicts to Task objects if needed
422
+ from hud.datasets import Task
423
+
424
+ tasks = []
425
+ for item in dataset_or_tasks:
426
+ if isinstance(item, dict):
427
+ tasks.append(Task(**item))
428
+ else:
429
+ tasks.append(item)
430
+
431
+ stats = await run_tasks_grouped(
432
+ tasks=tasks,
433
+ agent_class=agent_class,
434
+ agent_config=agent_config,
435
+ group_size=group_size,
436
+ max_parallel_episodes=max_concurrent
437
+ if not parallel
438
+ else max_concurrent_per_worker * (max_workers or 4),
439
+ max_steps=max_steps,
440
+ verbose=verbose,
441
+ job_id=job.id,
442
+ )
443
+
444
+ # Display results
445
+ display_group_statistics(stats, show_details=len(stats) <= 20)
446
+
447
+ # Return stats for consistency with other modes
448
+ return stats
449
+
450
+ # Original logic for non-grouped evaluation
451
+ elif parallel:
280
452
  hud_console.info(
281
453
  f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…" # noqa: E501
282
454
  )
@@ -322,17 +494,17 @@ async def run_full_dataset(
322
494
  def eval_command(
323
495
  source: str = typer.Argument(
324
496
  ...,
325
- help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50'), single task JSON file, or JSON file with list of tasks", # noqa: E501
497
+ help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50'), JSON file (array of tasks), or JSONL file (one task per line)", # noqa: E501
326
498
  ),
327
499
  full: bool = typer.Option(
328
500
  False,
329
501
  "--full",
330
502
  help="Run the entire dataset (omit for single-task debug mode)",
331
503
  ),
332
- agent: Literal["claude", "openai"] = typer.Option(
504
+ agent: Literal["claude", "openai", "vllm"] = typer.Option(
333
505
  "claude",
334
506
  "--agent",
335
- help="Agent backend to use",
507
+ help="Agent backend to use (claude, openai, or vllm for local server)",
336
508
  ),
337
509
  model: str | None = typer.Option(
338
510
  None,
@@ -374,6 +546,16 @@ def eval_command(
374
546
  "--verbose",
375
547
  help="Enable verbose output from the agent",
376
548
  ),
549
+ vllm_base_url: str | None = typer.Option(
550
+ None,
551
+ "--vllm-base-url",
552
+ help="Base URL for vLLM server (when using --agent vllm)",
553
+ ),
554
+ group_size: int = typer.Option(
555
+ 1,
556
+ "--group-size",
557
+ help="Number of times to run each task (similar to RL training)",
558
+ ),
377
559
  ) -> None:
378
560
  """🚀 Run evaluation on datasets or individual tasks with agents.
379
561
 
@@ -402,6 +584,12 @@ def eval_command(
402
584
  # Run with OpenAI Operator agent
403
585
  hud eval hud-evals/OSWorld-Gold-Beta --agent openai
404
586
 
587
+ # Use local vLLM server (default: localhost:8000)
588
+ hud eval task.json --agent vllm --model Qwen/Qwen2.5-VL-3B-Instruct
589
+
590
+ # Use custom vLLM server URL
591
+ hud eval task.json --agent vllm --vllm-base-url http://192.168.1.100:8000/v1
592
+
405
593
  # Run with verbose output for debugging
406
594
  hud eval task.json --verbose
407
595
  """
@@ -419,6 +607,12 @@ def eval_command(
419
607
  hud_console.error("OPENAI_API_KEY is required for OpenAI agent")
420
608
  hud_console.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
421
609
  raise typer.Exit(1)
610
+ elif agent == "vllm":
611
+ if model:
612
+ hud_console.info(f"Using vLLM with model: {model}")
613
+ else:
614
+ hud_console.error("Model name is required for vLLM agent, specify with --model")
615
+ raise typer.Exit(1)
422
616
 
423
617
  # Check for HUD_API_KEY if using HUD services
424
618
  if not settings.api_key:
@@ -448,6 +642,8 @@ def eval_command(
448
642
  max_workers=max_workers,
449
643
  max_concurrent_per_worker=max_concurrent_per_worker,
450
644
  verbose=verbose,
645
+ vllm_base_url=vllm_base_url,
646
+ group_size=group_size,
451
647
  )
452
648
  )
453
649
  else:
@@ -459,5 +655,7 @@ def eval_command(
459
655
  allowed_tools=allowed_tools_list,
460
656
  max_steps=max_steps,
461
657
  verbose=verbose,
658
+ vllm_base_url=vllm_base_url,
659
+ group_size=group_size,
462
660
  )
463
661
  )
File without changes
hud/cli/flows/tasks.py ADDED
File without changes