hud-python 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hud-python might be problematic.

Files changed (42)
  1. hud/agents/base.py +118 -33
  2. hud/agents/claude.py +1 -1
  3. hud/agents/openai.py +5 -16
  4. hud/agents/tests/test_openai.py +24 -79
  5. hud/cli/__init__.py +137 -15
  6. hud/cli/analyze.py +2 -4
  7. hud/cli/build.py +6 -2
  8. hud/cli/dev.py +67 -0
  9. hud/cli/eval.py +90 -35
  10. hud/cli/hf.py +406 -0
  11. hud/cli/init.py +38 -19
  12. hud/cli/rl/README.md +243 -0
  13. hud/cli/rl/__init__.py +82 -0
  14. hud/cli/rl/init.py +370 -0
  15. hud/cli/rl/pod.py +491 -0
  16. hud/cli/rl/ssh.py +288 -0
  17. hud/cli/rl/train.py +421 -0
  18. hud/cli/rl/utils.py +165 -0
  19. hud/cli/tests/test_mcp_server.py +1 -4
  20. hud/clients/base.py +2 -0
  21. hud/clients/fastmcp.py +7 -2
  22. hud/clients/mcp_use.py +3 -1
  23. hud/clients/utils/retry_transport.py +34 -8
  24. hud/datasets/__init__.py +32 -0
  25. hud/datasets/execution/__init__.py +13 -0
  26. hud/datasets/execution/parallel.py +592 -0
  27. hud/datasets/execution/runner.py +123 -0
  28. hud/datasets/task.py +107 -0
  29. hud/datasets/utils.py +118 -0
  30. hud/otel/instrumentation.py +2 -1
  31. hud/server/server.py +58 -21
  32. hud/settings.py +12 -0
  33. hud/types.py +31 -10
  34. hud/utils/design.py +168 -2
  35. hud/utils/tests/test_version.py +1 -1
  36. hud/version.py +1 -1
  37. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/METADATA +4 -3
  38. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/RECORD +41 -28
  39. hud/datasets.py +0 -327
  40. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/WHEEL +0 -0
  41. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/entry_points.txt +0 -0
  42. {hud_python-0.4.14.dist-info → hud_python-0.4.16.dist-info}/licenses/LICENSE +0 -0
hud/clients/utils/retry_transport.py
@@ -83,20 +83,46 @@ class RetryTransport(AsyncHTTPTransport):
                 last_exception = e
                 if attempt < self.max_retries:
                     delay = self.retry_delay * (self.backoff_factor**attempt)
-                    logger.warning(
-                        "%s for %s, retrying in %.1fs (attempt %d/%d)",
-                        type(e).__name__,
-                        request.url,
-                        delay,
-                        attempt + 1,
-                        self.max_retries,
-                    )
+                    # More informative message for connection errors
+                    if isinstance(e, httpx.ConnectError):
+                        logger.warning(
+                            "Could not connect to %s, retrying in %.1fs (attempt %d/%d). "
+                            "Make sure the MCP server is running.",
+                            request.url,
+                            delay,
+                            attempt + 1,
+                            self.max_retries,
+                        )
+                    else:
+                        logger.warning(
+                            "%s for %s, retrying in %.1fs (attempt %d/%d)",
+                            type(e).__name__,
+                            request.url,
+                            delay,
+                            attempt + 1,
+                            self.max_retries,
+                        )
                     await asyncio.sleep(delay)
                     continue
                 raise

         # If we get here, we've exhausted retries
         if last_exception:
+            if isinstance(last_exception, httpx.ConnectError):
+                # Enhance the connection error message
+                url = str(request.url)
+                if "localhost" in url or "127.0.0.1" in url:
+                    raise httpx.ConnectError(
+                        f"Failed to connect to {url} after {self.max_retries} attempts. "
+                        f"Make sure the local MCP server is running (e.g., 'hud dev' in another terminal).",  # noqa: E501
+                        request=request,
+                    ) from last_exception
+                else:
+                    raise httpx.ConnectError(
+                        f"Failed to connect to {url} after {self.max_retries} attempts. "
+                        f"Check that the server is accessible and running.",
+                        request=request,
+                    ) from last_exception
             raise last_exception
         else:
             # This shouldn't happen, but just in case
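The retry loop above backs off exponentially before each attempt. As a rough illustration of the schedule it produces (the retry_delay, backoff_factor, and max_retries values below are assumptions for the example, not values taken from this diff):

    # Hypothetical defaults; the actual values live elsewhere in retry_transport.py.
    retry_delay, backoff_factor, max_retries = 1.0, 2.0, 3

    for attempt in range(max_retries):
        delay = retry_delay * (backoff_factor**attempt)
        print(f"attempt {attempt + 1}/{max_retries}: retrying in {delay:.1f}s")
    # attempt 1/3: retrying in 1.0s
    # attempt 2/3: retrying in 2.0s
    # attempt 3/3: retrying in 4.0s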
hud/datasets/__init__.py
@@ -0,0 +1,32 @@
+"""HUD datasets module.
+
+Provides data models, utilities, and execution functions for working with HUD datasets.
+"""
+
+# Data models
+# Execution functions
+from __future__ import annotations
+
+from .execution import (
+    calculate_optimal_workers,
+    run_dataset,
+    run_dataset_parallel,
+    run_dataset_parallel_manual,
+)
+from .task import Task
+
+# Utilities
+from .utils import fetch_system_prompt_from_dataset, save_tasks
+
+__all__ = [
+    # Core data model
+    "Task",
+    "calculate_optimal_workers",
+    # Utilities
+    "fetch_system_prompt_from_dataset",
+    # Execution
+    "run_dataset",
+    "run_dataset_parallel",
+    "run_dataset_parallel_manual",
+    "save_tasks",
+]
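These exports replace the removed single-file hud/datasets.py module, so the dataset API is now imported directly from the hud.datasets package. A minimal usage sketch, reusing ClaudeAgent and the dataset identifier that appear in the docstring examples later in this diff:

    import asyncio

    from hud.agents import ClaudeAgent
    from hud.datasets import run_dataset_parallel


    async def main() -> None:
        # Auto-configured process-based run; see parallel.py below for details.
        results = await run_dataset_parallel(
            "Large Evaluation",
            "hud-evals/benchmark-400",
            ClaudeAgent,
            max_concurrent=50,
        )
        print(f"{len(results)} results")


    asyncio.run(main())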
hud/datasets/execution/__init__.py
@@ -0,0 +1,13 @@
+"""Dataset execution module."""
+
+from __future__ import annotations
+
+from .parallel import calculate_optimal_workers, run_dataset_parallel, run_dataset_parallel_manual
+from .runner import run_dataset
+
+__all__ = [
+    "calculate_optimal_workers",
+    "run_dataset",
+    "run_dataset_parallel",
+    "run_dataset_parallel_manual",
+]
hud/datasets/execution/parallel.py
@@ -0,0 +1,592 @@
+"""Process-based parallel dataset runner."""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import traceback
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from functools import partial
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from datasets import Dataset
+
+    from hud.agents import MCPAgent
+
+logger = logging.getLogger(__name__)
+
+
+# Worker function that runs in a separate process
+def _process_worker(
+    task_batch: list[tuple[int, dict[str, Any]]],
+    agent_class_module: str,
+    agent_class_name: str,
+    agent_config: dict[str, Any] | None,
+    job_id: str,
+    job_name: str,
+    max_steps: int,
+    auto_respond: bool,
+    worker_id: int,
+    total_workers: int,
+    max_concurrent_per_worker: int,
+) -> list[tuple[int, Any]]:
+    """
+    Worker function that runs in a separate process.
+
+    This function:
+    1. Reinitializes telemetry in the new process
+    2. Creates its own event loop
+    3. Processes a batch of tasks asynchronously
+    4. Returns results with their original indices
+
+    Args:
+        task_batch: List of (index, task_dict) tuples
+        agent_class_module: Module path for the agent class
+        agent_class_name: Name of the agent class
+        agent_config: Configuration for agent initialization
+        job_id: Job ID for telemetry tracking
+        job_name: Job name for logging
+        max_steps: Maximum steps per task
+        auto_respond: Whether to use ResponseAgent
+        worker_id: ID of this worker process
+        total_workers: Total number of worker processes
+        max_concurrent_per_worker: Maximum concurrent tasks within each worker
+
+    Returns:
+        List of (index, result) tuples
+    """
+    # Import inside worker to avoid pickling issues
+    import sys
+
+    import hud
+    from hud.agents.misc.response_agent import ResponseAgent
+    from hud.datasets.task import Task
+    from hud.otel import configure_telemetry
+
+    # Ensure stdout is not buffered for immediate output
+    try:
+        sys.stdout.reconfigure(line_buffering=True)  # type: ignore
+        sys.stderr.reconfigure(line_buffering=True)  # type: ignore
+    except AttributeError:
+        pass
+
+    # Reinitialize telemetry in this process
+    configure_telemetry()
+
+    # Dynamically import the agent class
+    try:
+        import importlib
+
+        module = importlib.import_module(agent_class_module)
+        agent_class = getattr(module, agent_class_name)
+    except (ImportError, AttributeError) as e:
+        logger.error("Worker %s: Failed to import agent class: %s", worker_id, e)
+        return [(idx, {"error": str(e), "isError": True}) for idx, _ in task_batch]
+
+    # Create new event loop for this process
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+
+    async def process_batch() -> list[tuple[int, Any]]:
+        """Process all tasks in the batch asynchronously."""
+        results = []
+
+        # Use semaphore to limit concurrency within the process
+        sem = asyncio.Semaphore(max_concurrent_per_worker)
+
+        async def process_single_task(index: int, task_dict: dict[str, Any]) -> tuple[int, Any]:
+            """Process a single task with telemetry tracking."""
+            async with sem:
+                try:
+                    # Create trace for this task (linked to the job) - match original format
+                    task_name = task_dict.get("prompt") or f"Task {index}"
+
+                    # Use the job_id to group all tasks under the same job
+                    with hud.trace(task_name, job_id=job_id, task_id=task_dict.get("id")):
+                        # Convert dict to Task
+                        task = Task(**task_dict)
+
+                        # Create agent instance
+                        agent = agent_class(**(agent_config or {}))
+
+                        if auto_respond:
+                            agent.response_agent = ResponseAgent()
+
+                        # Run the task
+                        result = await agent.run(task, max_steps=max_steps)
+
+                        # Extract and print evaluation score for visibility
+                        reward = getattr(result, "reward", "N/A")
+                        logger.info(
+                            "[Worker %s] Task %s: ✓ Completed (reward: %s)",
+                            worker_id,
+                            index,
+                            reward,
+                        )
+
+                        logger.info(
+                            "[Worker %s] Completed task %s (reward: %s)",
+                            worker_id,
+                            index,
+                            reward,
+                        )
+
+                        return (index, result)
+
+                except Exception as e:
+                    error_msg = f"Worker {worker_id}: Task {index} failed: {e}"
+                    logger.error(
+                        "[Worker %s] Task %s: ✗ Failed (%s)", worker_id, index, str(e)[:100]
+                    )
+                    logger.error("%s\n%s", error_msg, traceback.format_exc())
+
+                    return (
+                        index,
+                        {
+                            "error": str(e),
+                            "traceback": traceback.format_exc(),
+                            "isError": True,
+                            "reward": 0.0,
+                            "done": False,
+                            "content": f"Task failed: {e}",
+                        },
+                    )
+
+        # Process all tasks in parallel within this process
+        tasks = [process_single_task(idx, task_dict) for idx, task_dict in task_batch]
+
+        results = await asyncio.gather(*tasks, return_exceptions=False)
+        return results
+
+    try:
+        # Run the async batch processing
+        results = loop.run_until_complete(process_batch())
+
+        # CRITICAL: Ensure telemetry is fully sent before process exits
+        # Two things need to complete:
+        # 1. The trace context's __exit__ already called _update_task_status_sync (blocking)
+        # 2. But spans are buffered in BatchSpanProcessor and need explicit flush
+
+        from opentelemetry import trace as otel_trace
+
+        provider = otel_trace.get_tracer_provider()
+        if provider and hasattr(provider, "force_flush"):
+            # This forces BatchSpanProcessor to export all buffered spans NOW
+            # The method returns True if successful, False if timeout
+            success = provider.force_flush(timeout_millis=5000)  # 5 second timeout  # type: ignore
+            if not success:
+                logger.warning("Worker %s: Telemetry flush timed out", worker_id)
+
+        return results
+    except Exception as e:
+        logger.error("[Worker %s] Batch processing failed: %s", worker_id, e)
+        logger.error("Worker %s batch processing failed: %s", worker_id, e)
+        return [(idx, {"error": str(e), "isError": True}) for idx, _ in task_batch]
+    finally:
+        # Clean up the event loop
+        try:
+            loop.close()
+        except Exception as e:
+            logger.warning("Worker %s: Failed to close event loop: %s", worker_id, e)
+
+
+async def run_dataset_parallel_manual(
+    name: str,
+    dataset: str | Dataset | list[dict[str, Any]],
+    agent_class: type[MCPAgent],
+    agent_config: dict[str, Any] | None = None,
+    max_workers: int | None = None,
+    max_concurrent_per_worker: int = 25,
+    max_concurrent: int | None = None,
+    metadata: dict[str, Any] | None = None,
+    max_steps: int = 10,
+    split: str = "train",
+    auto_respond: bool = False,
+    custom_system_prompt: str | None = None,
+) -> list[Any]:
+    """
+    Run all tasks in a dataset using process-based parallelism with manual configuration.
+
+    This function distributes tasks evenly across multiple processes to achieve true parallelism,
+    bypassing Python's GIL limitations. Each process runs its own event loop with concurrent
+    task execution controlled by max_concurrent_per_worker or max_concurrent.
+
+    Args:
+        name: Name for the job (shown in telemetry)
+        dataset: HuggingFace dataset identifier, Dataset object, or list of task dicts
+        agent_class: Agent class to use (must be importable in worker processes)
+        agent_config: Configuration for agent initialization
+        max_workers: Number of processes (defaults to CPU count)
+        max_concurrent_per_worker: Max concurrent tasks within each worker
+        max_concurrent: Optional total concurrent limit across all workers (overrides per-worker)
+        metadata: Optional metadata for the job
+        max_steps: Maximum steps per task
+        split: Dataset split when loading from string
+        auto_respond: Whether to use ResponseAgent
+        custom_system_prompt: Override system prompt for all tasks
+
+    Returns:
+        List of results in the same order as the input dataset
+
+    Example:
+        >>> from hud.agents import ClaudeAgent
+        >>> from hud.datasets import run_dataset_parallel_manual
+        >>> # Run with 8 workers, 10 concurrent per worker (80 total concurrent)
+        >>> results = await run_dataset_parallel_manual(
+        ...     "Large Scale Eval",
+        ...     "hud-evals/benchmark-400",
+        ...     ClaudeAgent,
+        ...     max_workers=8,
+        ...     max_concurrent_per_worker=10,
+        ... )
+        >>> # OR limit total concurrent to prevent rate limits
+        >>> results = await run_dataset_parallel_manual(
+        ...     "Rate Limited Eval",
+        ...     dataset,
+        ...     ClaudeAgent,
+        ...     max_workers=8,
+        ...     max_concurrent=20,  # Only 20 total concurrent
+        ... )
+    """
+    from datasets import Dataset
+    from datasets import load_dataset as hf_load_dataset
+
+    import hud
+
+    # Determine optimal worker count
+    if max_workers is None:
+        max_workers = min(os.cpu_count() or 4, 16)  # Cap at 16 to be reasonable
+
+    # If max_concurrent is specified, calculate per-worker concurrency
+    if max_concurrent is not None:
+        # Distribute concurrent limit across workers
+        # Each worker should get a fair share of the total concurrent limit
+        max_concurrent_per_worker = max(1, max_concurrent // max_workers)
+        logger.info(
+            "Limiting to %s total concurrent tasks %s per worker)",
+            max_concurrent,
+            max_concurrent_per_worker,
+        )
+
+    logger.info(
+        "Starting parallel dataset run with %s workers (%s concurrent per worker)",
+        max_workers,
+        max_concurrent_per_worker,
+    )
+
+    # Load dataset if needed
+    dataset_link = None
+    task_dicts: list[dict[str, Any]]
+
+    if isinstance(dataset, str):
+        logger.info("Loading dataset %s from HuggingFace...", dataset)
+        dataset_link = dataset
+        loaded_dataset = hf_load_dataset(dataset, split=split)
+        task_dicts = list(loaded_dataset)  # type: ignore
+    elif isinstance(dataset, Dataset):
+        task_dicts = list(dataset)  # type: ignore
+    elif isinstance(dataset, list):
+        task_dicts = dataset
+    else:
+        raise ValueError(f"Dataset must be string, Dataset, or list, got {type(dataset)}")
+
+    # Apply custom system prompt if provided
+    if custom_system_prompt:
+        for task_dict in task_dicts:
+            if "system_prompt" not in task_dict:
+                task_dict["system_prompt"] = custom_system_prompt
+
+    # Prepare job metadata
+    job_metadata = metadata or {}
+    job_metadata.update(
+        {
+            "agent_class": agent_class.__name__,
+            "agent_config": agent_config,
+            "parallel_mode": "process_pool",
+            "max_workers": max_workers,
+            "max_concurrent_per_worker": max_concurrent_per_worker,
+            "total_tasks": len(task_dicts),
+        }
+    )
+
+    # Extract dataset verification info if available (match original)
+    if isinstance(dataset, Dataset) and not dataset_link:
+        try:
+            general_info = next(iter(dataset.info.__dict__["download_checksums"].keys())).split("/")
+            project = general_info[3]
+            dataset_name = general_info[4].split("@")[0]
+            dataset_link = f"{project}/{dataset_name}"
+        except Exception:
+            logger.warning("Failed to extract dataset verification info")
+
+    # Create job context
+    with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
+        # Prepare agent class info for pickling
+        agent_module = agent_class.__module__
+        agent_name = agent_class.__name__
+
+        # Divide tasks evenly among workers
+        num_tasks = len(task_dicts)
+        tasks_per_worker = (num_tasks + max_workers - 1) // max_workers  # Ceiling division
+
+        task_batches: list[list[tuple[int, dict[str, Any]]]] = []
+        for i in range(0, num_tasks, tasks_per_worker):
+            batch = [
+                (idx, task_dict)
+                for idx, task_dict in enumerate(task_dicts[i : i + tasks_per_worker], start=i)
+            ]
+            if batch:  # Only add non-empty batches
+                task_batches.append(batch)
+
+        logger.info(
+            "Distributing %s tasks across %s workers (~%s tasks per worker)",
+            num_tasks,
+            len(task_batches),
+            tasks_per_worker,
+        )
+
+        # Initialize results list
+        results: list[Any] = [None] * len(task_dicts)
+
+        # Create worker function with all needed context
+        worker_func = partial(
+            _process_worker,
+            agent_class_module=agent_module,
+            agent_class_name=agent_name,
+            agent_config=agent_config,
+            job_id=job_obj.id,
+            job_name=name,
+            max_steps=max_steps,
+            auto_respond=auto_respond,
+            total_workers=min(max_workers, len(task_batches)),
+            max_concurrent_per_worker=max_concurrent_per_worker,
+        )
+
+        # Process batches in parallel using ProcessPoolExecutor
+        with ProcessPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all batches to workers
+            future_to_batch = {
+                executor.submit(worker_func, batch, worker_id=i): batch
+                for i, batch in enumerate(task_batches)
+            }
+
+            # Track progress
+            completed = 0
+            total = len(task_dicts)
+
+            # Process results as they complete
+            for future in as_completed(future_to_batch):
+                batch = future_to_batch[future]
+
+                try:
+                    # Get results from this worker
+                    batch_results = future.result()
+
+                    # Place results in correct positions
+                    for index, result in batch_results:
+                        results[index] = result
+                        completed += 1
+
+                    # Calculate success rate so far
+                    successful_so_far = sum(
+                        1
+                        for r in results[:completed]
+                        if r is not None and getattr(r, "reward", 0) > 0
+                    )
+
+                    progress_msg = (
+                        f"Progress: {completed}/{total} tasks completed "
+                        f"({100 * completed / total:.1f}%) | "
+                        f"Success rate: {successful_so_far}/{completed} "
+                        f"({100 * successful_so_far / completed:.1f}%)"
+                    )
+
+                    logger.info(progress_msg)
+
+                except Exception as e:
+                    # Handle worker failure
+                    logger.error("Worker failed with exception: %s\n%s", e, traceback.format_exc())
+
+                    # Mark all tasks in this batch as failed
+                    for index, _ in batch:
+                        results[index] = {
+                            "error": f"Worker process failed: {e}",
+                            "isError": True,
+                            "reward": 0.0,
+                            "done": False,
+                            "content": f"Worker process failed: {e}",
+                        }
+                        completed += 1
+
+        # Verify all results are populated
+        missing = [i for i, r in enumerate(results) if r is None]
+        if missing:
+            logger.warning("Missing results for task indices: %s...", missing[:10])
+            for idx in missing:
+                results[idx] = {
+                    "error": "No result returned from worker",
+                    "isError": True,
+                    "reward": 0.0,
+                    "done": False,
+                    "content": "Task was not processed",
+                }
+
+        # Print final summary
+        total_tasks = len(results)
+        successful_tasks = sum(1 for r in results if getattr(r, "reward", 0) > 0)
+        failed_tasks = sum(1 for r in results if isinstance(r, dict) and r.get("isError", False))
+
+        logger.info("\n")
+        logger.info("=" * 60)
+        logger.info("📊 Parallel Evaluation Complete!")
+        logger.info("=" * 60)
+        logger.info("Total tasks: %s", total_tasks)
+        logger.info("Successful: %s (%s%%)", successful_tasks, 100 * successful_tasks / total_tasks)
+        logger.info("Failed: %s", failed_tasks)
+        logger.info("Workers used: %s", max_workers)
+        logger.info("=" * 60)
+
+        logger.info(
+            "Parallel dataset run completed: %s tasks, %s successful (%s%%)",
+            total_tasks,
+            successful_tasks,
+            100 * successful_tasks / total_tasks,
+        )
+
+        return results
+
+
+def calculate_optimal_workers(num_tasks: int, reserve_system_resources: bool = True) -> int:
+    """
+    Calculate optimal number of workers based on CPU cores and task count.
+
+    Simple heuristic:
+    - 1 worker per CPU core (minus 1-2 for system if reserve_system_resources)
+    - But don't create more workers than tasks
+    - Cap at reasonable maximum
+
+    Args:
+        num_tasks: Total number of tasks to process
+        reserve_system_resources: Whether to leave CPU cores for system (default True)
+
+    Returns:
+        Optimal number of workers
+    """
+    # Get CPU count
+    cpu_count = os.cpu_count() or 4
+
+    # Reserve 1-2 cores for system if requested
+    if reserve_system_resources:
+        if cpu_count > 8:
+            available_cpus = cpu_count - 2  # Reserve 2 for systems with many cores
+        elif cpu_count > 2:
+            available_cpus = cpu_count - 1  # Reserve 1 for typical systems
+        else:
+            available_cpus = 1  # Minimum 1 worker
+    else:
+        available_cpus = cpu_count
+
+    # Cap at 32 workers to be reasonable
+    max_workers = min(available_cpus, 32)
+
+    # Don't create more workers than tasks
+    # But try to have at least 5-10 tasks per worker for efficiency
+    if num_tasks <= max_workers:
+        return min(num_tasks, max_workers)
+    else:
+        # For many tasks, use all available workers
+        # unless that would give us very few tasks per worker
+        min_tasks_per_worker = 10
+        ideal_workers = min(max_workers, max(1, num_tasks // min_tasks_per_worker))
+        return ideal_workers
+
+
+async def run_dataset_parallel(
+    name: str,
+    dataset: str | Dataset | list[dict[str, Any]],
+    agent_class: type[MCPAgent],
+    agent_config: dict[str, Any] | None = None,
+    max_concurrent: int | None = None,
+    metadata: dict[str, Any] | None = None,
+    max_steps: int = 10,
+    **kwargs: Any,
+) -> list[Any]:
+    """
+    Run all tasks in a dataset using automatically optimized process-based parallelism.
+
+    This function automatically determines the optimal number of workers
+    and batch sizes based on system resources and dataset size. For manual control
+    over worker configuration, use `run_dataset_parallel_manual`.
+
+    Args:
+        name: Name for the job
+        dataset: Dataset to run
+        agent_class: Agent class to use
+        agent_config: Agent configuration
+        max_concurrent: Maximum total concurrent tasks across all workers (prevents rate limits)
+        metadata: Optional metadata
+        max_steps: Maximum steps per task
+        **kwargs: Additional arguments passed to run_dataset_parallel_manual
+
+    Example:
+        >>> # Automatically handles 400+ tasks efficiently
+        >>> results = await run_dataset_parallel(
+        ...     "Large Evaluation",
+        ...     "hud-evals/benchmark-400",
+        ...     ClaudeAgent,
+        ...     max_concurrent=50,  # Limit to 50 concurrent API calls
+        ... )
+    """
+    # Load dataset to get size
+    num_tasks: int
+
+    if isinstance(dataset, str):
+        from datasets import load_dataset as hf_load_dataset
+
+        dataset_obj = hf_load_dataset(dataset, split=kwargs.get("split", "train"))
+        num_tasks = len(dataset_obj)  # type: ignore
+    elif hasattr(dataset, "__len__"):
+        num_tasks = len(dataset)
+    else:
+        # Convert to list to count
+        dataset_list: list[dict[str, Any]] = list(dataset)  # type: ignore
+        num_tasks = len(dataset_list)
+        dataset = dataset_list
+
+    # Calculate optimal configuration
+    num_workers = calculate_optimal_workers(num_tasks)
+
+    # Set default max_concurrent_per_worker if not using total limit
+    if max_concurrent is None:
+        max_concurrent_per_worker = 25  # Reasonable default
+    else:
+        max_concurrent_per_worker = max(1, max_concurrent // num_workers)
+
+    logger.info(
+        "Auto-configured for %s tasks: %s workers, %s concurrent per worker",
+        num_tasks,
+        num_workers,
+        max_concurrent_per_worker,
+    )
+
+    # Add auto-configuration info to metadata
+    if metadata is None:
+        metadata = {}
+    metadata["auto_configured"] = True
+    metadata["auto_num_workers"] = num_workers
+
+    # Run with optimized settings
+    return await run_dataset_parallel_manual(
+        name=name,
+        dataset=dataset,
+        agent_class=agent_class,
+        agent_config=agent_config,
+        max_workers=num_workers,
+        max_concurrent_per_worker=max_concurrent_per_worker,
+        max_concurrent=max_concurrent,
+        metadata=metadata,
+        max_steps=max_steps,
+        **kwargs,
+    )
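For reference, the scheduling arithmetic above reduces to ceiling-division batching plus an even split of any total concurrency limit across workers. A standalone worked example with illustrative numbers (the values are not taken from the release):

    num_tasks, max_workers, max_concurrent = 400, 8, 20

    # Ceiling division, as in run_dataset_parallel_manual: 400 tasks / 8 workers -> 50 per batch
    tasks_per_worker = (num_tasks + max_workers - 1) // max_workers

    # Even split of the total concurrency budget: 20 // 8 -> 2 concurrent tasks per worker
    max_concurrent_per_worker = max(1, max_concurrent // max_workers)

    batches = [
        list(range(i, min(i + tasks_per_worker, num_tasks)))
        for i in range(0, num_tasks, tasks_per_worker)
    ]
    print(len(batches), tasks_per_worker, max_concurrent_per_worker)  # 8 50 2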