hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/utils/agent_factories.py
DELETED
@@ -1,86 +0,0 @@
-"""Factory functions for creating agents compatible with run_dataset."""
-
-from __future__ import annotations
-
-from typing import Any
-
-from openai import AsyncOpenAI
-
-from hud.agents.grounded_openai import GroundedOpenAIChatAgent
-from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
-from hud.tools.grounding import GrounderConfig
-
-
-def create_openai_agent(**kwargs: Any) -> GenericOpenAIChatAgent:
-    """Factory for GenericOpenAIChatAgent with run_dataset compatibility.
-
-    Args:
-        api_key: OpenAI API key
-        base_url: Optional custom API endpoint
-        model_name: Model to use (e.g., "gpt-4o-mini")
-        **kwargs: Additional arguments passed to GenericOpenAIChatAgent
-
-    Returns:
-        Configured GenericOpenAIChatAgent instance
-
-    Example:
-        >>> from hud.datasets import run_dataset
-        >>> from hud.utils.agent_factories import create_openai_agent
-        >>> results = await run_dataset(
-        ...     "My Eval",
-        ...     "hud-evals/SheetBench-50",
-        ...     create_openai_agent,
-        ...     {"api_key": "your-key", "model_name": "gpt-4o-mini"},
-        ... )
-    """
-    api_key = kwargs.pop("api_key", None)
-    base_url = kwargs.pop("base_url", None)
-
-    openai_client = AsyncOpenAI(api_key=api_key, base_url=base_url)
-
-    return GenericOpenAIChatAgent(openai_client=openai_client, **kwargs)
-
-
-def create_grounded_agent(**kwargs: Any) -> GroundedOpenAIChatAgent:
-    """Factory for GroundedOpenAIChatAgent with run_dataset compatibility.
-
-    Args:
-        api_key: OpenAI API key for planning model
-        base_url: Optional custom API endpoint for planning model
-        model_name: Planning model to use (e.g., "gpt-4o-mini")
-        grounder_api_key: API key for grounding model
-        grounder_api_base: API base URL for grounding model (default: OpenRouter)
-        grounder_model: Grounding model to use (default: qwen/qwen-2.5-vl-7b-instruct)
-        **kwargs: Additional arguments passed to GroundedOpenAIChatAgent
-
-    Returns:
-        Configured GroundedOpenAIChatAgent instance
-
-    Example:
-        >>> from hud.datasets import run_dataset
-        >>> from hud.utils.agent_factories import create_grounded_agent
-        >>> results = await run_dataset(
-        ...     "Grounded Eval",
-        ...     dataset,
-        ...     create_grounded_agent,
-        ...     {
-        ...         "api_key": "openai-key",
-        ...         "grounder_api_key": "openrouter-key",
-        ...         "model_name": "gpt-4o-mini",
-        ...     },
-        ... )
-    """
-    api_key = kwargs.pop("api_key", None)
-    base_url = kwargs.pop("base_url", None)
-    grounder_api_key = kwargs.pop("grounder_api_key", None)
-    grounder_api_base = kwargs.pop("grounder_api_base", "https://openrouter.ai/api/v1")
-    grounder_model = kwargs.pop("grounder_model", "qwen/qwen-2.5-vl-7b-instruct")
-
-    openai_client = AsyncOpenAI(api_key=api_key, base_url=base_url)
-    grounder_config = GrounderConfig(
-        api_base=grounder_api_base, model=grounder_model, api_key=grounder_api_key
-    )
-
-    return GroundedOpenAIChatAgent(
-        openai_client=openai_client, grounder_config=grounder_config, **kwargs
-    )
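For context, these removed factories were designed to be handed to hud.datasets.run_dataset, which built an agent from the factory and config for each task. A minimal sketch of the 0.4.45-era usage, adapted from the factory's own docstring (the API key and dataset name are placeholders):

import asyncio

from hud.datasets import run_dataset
from hud.utils.agent_factories import create_openai_agent


async def main() -> None:
    # run_dataset constructs an agent via the factory + config for every task
    results = await run_dataset(
        "My Eval",                     # evaluation/job name
        "hud-evals/SheetBench-50",     # HuggingFace dataset of tasks
        create_openai_agent,           # agent factory (removed in 0.5.x)
        {"api_key": "your-key", "model_name": "gpt-4o-mini"},
    )
    print(f"collected {len(results)} results")


asyncio.run(main())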
hud/utils/async_utils.py
DELETED
@@ -1,65 +0,0 @@
-"""Async utilities for HUD SDK.
-
-This module provides utilities for running async code in various environments,
-including Jupyter notebooks and synchronous contexts.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import logging
-import threading
-from typing import TYPE_CHECKING, Any
-
-if TYPE_CHECKING:
-    from collections.abc import Coroutine
-
-logger = logging.getLogger(__name__)
-
-
-def fire_and_forget(coro: Coroutine[Any, Any, Any], description: str = "task") -> None:
-    """Execute a coroutine in a fire-and-forget manner.
-
-    This function handles running async code in various contexts:
-    - When an event loop is already running (normal async context)
-    - When no event loop exists (sync context, some Jupyter setups)
-    - Gracefully handles interpreter shutdown
-
-    Args:
-        coro: The coroutine to execute
-        description: Description of the task for logging (e.g., "update job status")
-
-    Example:
-        fire_and_forget(
-            some_async_function(),
-            description="update status"
-        )
-    """
-    try:
-        # Try to get current event loop
-        loop = asyncio.get_running_loop()
-        # Schedule the coroutine
-        task = loop.create_task(coro)
-        # Add error handler to prevent unhandled exceptions
-        task.add_done_callback(lambda t: t.exception() if not t.cancelled() else None)
-    except RuntimeError:
-        # No running event loop (e.g., Jupyter without %autoawait, sync context)
-        try:
-            # Try to run in a thread as a fallback
-            def run_in_thread() -> None:
-                loop = asyncio.new_event_loop()
-                asyncio.set_event_loop(loop)
-                try:
-                    loop.run_until_complete(coro)
-                except Exception as e:
-                    # Suppress warnings about interpreter shutdown
-                    if "interpreter shutdown" not in str(e):
-                        logger.debug("Error in threaded %s: %s", description, e)
-
-            thread = threading.Thread(target=run_in_thread, daemon=True)
-            thread.start()
-        except Exception as e:
-            # If that fails too, just log and continue
-            # Special case: suppress "cannot schedule new futures after interpreter shutdown"
-            if "interpreter shutdown" not in str(e):
-                logger.debug("Could not %s - no event loop available: %s", description, e)
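The removed fire_and_forget helper scheduled a coroutine on the running event loop, or on a throwaway daemon-thread loop when none existed, and swallowed any exception instead of raising it. A small sketch of typical usage, based on the docstring above; update_status is a stand-in coroutine:

import asyncio

from hud.utils.async_utils import fire_and_forget


async def update_status() -> None:
    # Stand-in for a non-critical telemetry/status call
    await asyncio.sleep(0.1)


async def main() -> None:
    # Schedules update_status() without awaiting it; failures are not propagated
    fire_and_forget(update_status(), description="update status")
    await asyncio.sleep(0.2)  # give the background task time to finish


asyncio.run(main())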
hud/utils/group_eval.py
DELETED
@@ -1,223 +0,0 @@
-"""Utilities for grouped evaluation of tasks, following the RL pattern."""
-
-from __future__ import annotations
-
-import asyncio
-from statistics import mean, stdev
-from typing import Any
-
-import numpy as np
-
-import hud
-from hud.datasets import Task
-from hud.types import Trace
-from hud.utils.hud_console import HUDConsole
-
-hud_console = HUDConsole()
-
-
-async def run_tasks_grouped(
-    tasks: list[Any],
-    agent_class: type | Any,
-    agent_config: dict[str, Any] | None = None,
-    group_size: int = 1,
-    max_parallel_episodes: int = 48,
-    max_steps: int = 10,
-    verbose: bool = False,
-    job_id: str | None = None,
-) -> list[dict[str, Any]]:
-    """
-    Run tasks with grouping, following the RL Actor pattern.
-
-    Args:
-        tasks: List of tasks to run
-        agent_class: Agent class or instance to use
-        agent_config: Configuration for agent instantiation
-        group_size: Number of times to run each task
-        max_parallel_episodes: Maximum parallel episodes to run
-        max_steps: Maximum steps per episode
-        verbose: Whether to show progress
-        job_id: Optional job ID for tracking
-
-    Returns:
-        List of statistics for each task group
-    """
-    agent_config = agent_config or {}
-
-    # Duplicate tasks according to group_size, exactly like RL
-    grouped_tasks = []
-    task_mapping = []  # Track which group each result belongs to
-
-    for i, task in enumerate(tasks):
-        for _ in range(group_size):
-            grouped_tasks.append(task)
-            task_mapping.append(i)
-
-    hud_console.info(
-        f"Running {len(tasks)} tasks with group_size={group_size} ({len(grouped_tasks)} total runs)"
-    )
-
-    # Run all episodes, respecting max_parallel_episodes
-    all_traces = []
-
-    for batch_start in range(0, len(grouped_tasks), max_parallel_episodes):
-        batch_end = min(batch_start + max_parallel_episodes, len(grouped_tasks))
-        batch = grouped_tasks[batch_start:batch_end]
-
-        # Run batch in parallel
-        async def run_single_episode(task_data: dict[str, Any] | Task, idx: int) -> Trace:
-            """Run a single episode."""
-            try:
-                # Create task if needed
-                task = Task(**task_data) if isinstance(task_data, dict) else task_data
-
-                # Create fresh agent instance
-                if isinstance(agent_class, type):
-                    agent = agent_class(**agent_config)
-                else:
-                    # Agent is already instantiated
-                    agent = agent_class
-
-                # Run the task
-                trace_name = f"Eval | {task.id if hasattr(task, 'id') else 'Task'} | Group {task_mapping[idx]}"  # noqa: E501
-                with hud.trace(trace_name, job_id=job_id):
-                    result = await agent.run(task, max_steps=max_steps)
-                    return result
-
-            except Exception as e:
-                hud_console.warning_log(f"Episode failed: {e}")
-                return Trace(isError=True, content=str(e), reward=0.0, done=True)
-
-        # Run batch
-        batch_results = await asyncio.gather(
-            *[run_single_episode(t, batch_start + i) for i, t in enumerate(batch)],
-            return_exceptions=True,
-        )
-
-        # Normalize exceptions to error traces
-        for res in batch_results:
-            if isinstance(res, Exception):
-                hud_console.warning_log(f"Episode error: {res}")
-                all_traces.append(Trace(isError=True, content=str(res), reward=0.0, done=True))
-            else:
-                all_traces.append(res)
-
-        if verbose:
-            hud_console.info(f"Completed batch: {len(all_traces)}/{len(grouped_tasks)} episodes")
-
-    # Group results back by original task and calculate statistics
-    return calculate_group_statistics(tasks, all_traces, task_mapping, group_size)
-
-
-def calculate_group_statistics(
-    original_tasks: list[Any],
-    traces: list[Trace],
-    task_mapping: list[int],
-    group_size: int,
-) -> list[dict[str, Any]]:
-    """
-    Calculate statistics for each group, similar to preprocess_advantages.
-
-    Args:
-        original_tasks: Original task list
-        traces: All traces from grouped runs
-        task_mapping: Mapping of trace index to task index
-        group_size: Number of runs per task
-
-    Returns:
-        List of statistics for each task
-    """
-    stats = []
-
-    # Process each original task
-    for task_idx, task in enumerate(original_tasks):
-        # Get all traces for this task
-        task_traces = [
-            traces[i] for i, mapping_idx in enumerate(task_mapping) if mapping_idx == task_idx
-        ]
-
-        # Extract rewards
-        rewards = np.array([t.reward for t in task_traces])
-        errors = [t for t in task_traces if t.isError]
-
-        # Calculate statistics
-        task_stats = {
-            "task_id": task.id
-            if isinstance(task, Task) and hasattr(task, "id")
-            else f"task_{task_idx}",
-            "prompt": task.prompt if isinstance(task, Task) else task.get("prompt", ""),
-            "group_size": group_size,
-            "rewards": rewards.tolist(),
-            "mean_reward": float(np.mean(rewards)),
-            "std_reward": float(np.std(rewards)) if len(rewards) > 1 else 0.0,
-            "min_reward": float(np.min(rewards)),
-            "max_reward": float(np.max(rewards)),
-            "success_rate": float(np.sum(rewards > 0) / len(rewards)) if len(rewards) > 0 else 0.0,
-            "error_rate": len(errors) / len(task_traces) if len(task_traces) > 0 else 0.0,
-            "traces": task_traces,  # Keep full traces for detailed analysis
-        }
-
-        # Add variance info like RL does
-        if task_stats["std_reward"] > 1e-6:
-            task_stats["normalized_rewards"] = [
-                (r - task_stats["mean_reward"]) / task_stats["std_reward"] for r in rewards
-            ]
-        else:
-            task_stats["normalized_rewards"] = [0.0] * len(rewards)
-
-        stats.append(task_stats)
-
-    return stats
-
-
-def display_group_statistics(stats: list[dict[str, Any]], show_details: bool = True) -> None:
-    """Display statistics from grouped evaluation."""
-    from rich.console import Console
-    from rich.table import Table
-
-    console = Console()
-
-    # Overall statistics
-    all_means = [s["mean_reward"] for s in stats]
-    overall_mean = mean(all_means) if all_means else 0.0
-    overall_std = stdev(all_means) if len(all_means) > 1 else 0.0
-
-    hud_console.success("\n📊 Evaluation Summary")
-    hud_console.info(f"Tasks evaluated: {len(stats)}")
-    hud_console.info(f"Episodes per task: {stats[0]['group_size'] if stats else 0}")
-    hud_console.info(f"Total episodes: {sum(len(s['rewards']) for s in stats)}")
-    hud_console.info(f"Overall mean reward: {overall_mean:.3f} ± {overall_std:.3f}")
-
-    # Detailed table
-    if show_details and len(stats) <= 50:  # Only show for reasonable dataset sizes
-        table = Table(title="\nPer-Task Performance Distribution")
-        table.add_column("Task", style="cyan", no_wrap=True)
-        table.add_column("Mean±Std", justify="right", style="green")
-        table.add_column("Min/Max", justify="right")
-        table.add_column("Success%", justify="right", style="yellow")
-        table.add_column("Rewards", style="dim")
-
-        for stat in stats:
-            task_name = stat["prompt"][:30] + "..." if len(stat["prompt"]) > 30 else stat["prompt"]
-            rewards_str = " ".join([f"{r:.2f}" for r in stat["rewards"][:5]])
-            if len(stat["rewards"]) > 5:
-                rewards_str += " ..."
-
-            table.add_row(
-                task_name,
-                f"{stat['mean_reward']:.3f}±{stat['std_reward']:.3f}",
-                f"{stat['min_reward']:.2f}/{stat['max_reward']:.2f}",
-                f"{stat['success_rate'] * 100:.0f}%",
-                rewards_str,
-            )
-
-        console.print(table)
-
-    # High variance tasks
-    high_variance_tasks = [s for s in stats if s["std_reward"] > 0.3 and s["group_size"] > 1]
-    if high_variance_tasks:
-        hud_console.warning(f"\n{len(high_variance_tasks)} tasks show high variance (std > 0.3)")
-        for task in high_variance_tasks[:3]:
-            hud_console.info(
-                f"  • {task['task_id']}: μ={task['mean_reward']:.3f}, σ={task['std_reward']:.3f}"  # noqa: RUF001
-            )
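The removed grouped-evaluation helpers ran every task group_size times, batched episodes up to max_parallel_episodes, and reduced the traces to per-task reward statistics. A sketch of that API under the assumption that hud.agents.ClaudeAgent (any 0.4.x agent class would do) is used; the task-dict shape is purely illustrative:

import asyncio

from hud.agents import ClaudeAgent
from hud.utils.group_eval import display_group_statistics, run_tasks_grouped

# Illustrative task dicts; anything accepted by hud.datasets.Task works here
tasks = [
    {
        "prompt": "Open the settings page",
        "mcp_config": {"local": {"command": "docker", "args": ["run", "my-env"]}},
    },
]


async def main() -> None:
    stats = await run_tasks_grouped(
        tasks,
        agent_class=ClaudeAgent,
        agent_config={},           # kwargs forwarded to ClaudeAgent(...)
        group_size=4,              # run each task four times
        max_parallel_episodes=8,   # cap on concurrent episodes per batch
        max_steps=10,
    )
    # Prints a summary plus a per-task mean/std/success-rate table via rich
    display_group_statistics(stats)


asyncio.run(main())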
hud/utils/progress.py
DELETED
@@ -1,149 +0,0 @@
-from __future__ import annotations
-
-import time
-from collections import defaultdict
-
-
-class StepProgressTracker:
-    """
-    Tracks progress across potentially parallel async tasks based on steps completed.
-    Provides estimates assuming tasks run up to max_steps_per_task.
-    """
-
-    def __init__(self, total_tasks: int, max_steps_per_task: int) -> None:
-        """
-        Initialize the StepProgressTracker.
-
-        Args:
-            total_tasks: The total number of tasks to track.
-            max_steps_per_task: The maximum number of steps per task.
-
-        Raises:
-            ValueError: If total_tasks or max_steps_per_task is not positive.
-        """
-        if total_tasks <= 0:
-            raise ValueError("total_tasks must be positive")
-        if max_steps_per_task <= 0:
-            raise ValueError("max_steps_per_task must be positive")
-
-        self.total_tasks = total_tasks
-        self.max_steps_per_task = max_steps_per_task
-        self.total_potential_steps = total_tasks * max_steps_per_task
-
-        # Use asyncio.Lock for potentially concurrent updates/reads if needed,
-        # but start without for simplicity in single-threaded asyncio.
-        # self._lock = asyncio.Lock()
-        self._task_steps: dict[str, int] = defaultdict(int)
-        self._finished_tasks: dict[str, bool] = defaultdict(bool)
-        self._tasks_started = 0
-        self._tasks_finished = 0
-
-        self.start_time: float | None = None
-        self.current_total_steps = 0
-
-    def start_task(self, task_id: str) -> None:
-        # async with self._lock:  # If using lock
-        if self.start_time is None:
-            self.start_time = time.monotonic()
-        self._task_steps[task_id] = 0
-        self._finished_tasks[task_id] = False
-        self._tasks_started += 1
-
-    def increment_step(self, task_id: str) -> None:
-        # async with self._lock:
-        if (
-            not self._finished_tasks[task_id]
-            and self._task_steps[task_id] < self.max_steps_per_task
-        ):
-            self._task_steps[task_id] += 1
-            # Update overall progress immediately
-            self._update_total_steps()
-
-    def finish_task(self, task_id: str) -> None:
-        # async with self._lock:
-        if not self._finished_tasks[task_id]:
-            # For calculation, consider a finished task as having completed max steps
-            self._task_steps[task_id] = self.max_steps_per_task
-            self._finished_tasks[task_id] = True
-            self._tasks_finished += 1
-            # Update overall progress
-            self._update_total_steps()
-
-    def _update_total_steps(self) -> None:
-        # This could be expensive if called extremely frequently.
-        # Called after increment or finish.
-        # async with self._lock:
-        self.current_total_steps = sum(self._task_steps.values())
-
-    def get_progress(self) -> tuple[int, int, float]:
-        """Returns (current_steps, total_potential_steps, percentage)."""
-        # async with self._lock:
-        # Recalculate here for safety, though _update_total_steps should keep it current
-        # current_steps = sum(self._task_steps.values())
-        current_steps = self.current_total_steps
-
-        percentage = 0.0
-        if self.total_potential_steps > 0:
-            percentage = (current_steps / self.total_potential_steps) * 100
-        return current_steps, self.total_potential_steps, percentage
-
-    def get_stats(self) -> tuple[float, float | None]:
-        """Returns (rate_steps_per_minute, eta_seconds_upper_bound)."""
-        # async with self._lock:
-        if self.start_time is None or self._tasks_started == 0:
-            return 0.0, None  # No rate or ETA yet
-
-        elapsed_time = time.monotonic() - self.start_time
-        current_steps = self.current_total_steps
-
-        rate_sec = 0.0
-        if elapsed_time > 0:
-            rate_sec = current_steps / elapsed_time
-
-        rate_min = rate_sec * 60  # Convert rate to steps per minute
-
-        eta = None
-        # ETA calculation still uses rate_sec (steps/second) for time estimation in seconds
-        if rate_sec > 0:
-            remaining_steps = self.total_potential_steps - current_steps
-            eta = remaining_steps / rate_sec if remaining_steps > 0 else 0.0
-
-        return rate_min, eta  # Return rate in steps/min
-
-    def is_finished(self) -> bool:
-        # async with self._lock:
-        return self._tasks_finished >= self.total_tasks
-
-    def display(self, bar_length: int = 40) -> str:
-        """Generates a progress string similar to tqdm."""
-        current_steps, total_steps, percentage = self.get_progress()
-        rate_min, eta = self.get_stats()  # Rate is now per minute
-
-        # Ensure valid values for display
-        current_steps = min(current_steps, total_steps)
-        percentage = max(0.0, min(100.0, percentage))
-
-        filled_length = int(bar_length * current_steps // total_steps) if total_steps else 0
-        bar = "█" * filled_length + "-" * (bar_length - filled_length)
-
-        # Format time
-        elapsed_str = "0:00"
-        eta_str = "??:??"
-        if self.start_time:
-            elapsed_seconds = int(time.monotonic() - self.start_time)
-            elapsed_str = f"{elapsed_seconds // 60}:{elapsed_seconds % 60:02d}"
-        if eta is not None:
-            eta_seconds = int(eta)
-            eta_str = f"{eta_seconds // 60}:{eta_seconds % 60:02d}"
-        elif self.is_finished():
-            eta_str = "0:00"
-
-        # Update rate string format
-        rate_str = f"{rate_min:.1f} steps/min" if rate_min > 0 else "?? steps/min"
-
-        # Format steps - use K/M for large numbers if desired, keep simple for now
-        steps_str = f"{current_steps}/{total_steps}"
-
-        # tasks_str = f" {self._tasks_finished}/{self.total_tasks} tasks"  # Optional tasks counter
-
-        return f"{percentage:3.0f}%|{bar}| {steps_str} [{elapsed_str}<{eta_str}, {rate_str}]"
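StepProgressTracker measured progress in completed steps rather than finished tasks, so long-running parallel episodes still moved the bar. A minimal synchronous sketch of how the removed class was driven, based on its methods above:

from hud.utils.progress import StepProgressTracker

tracker = StepProgressTracker(total_tasks=2, max_steps_per_task=5)

for task_id in ("task-a", "task-b"):
    tracker.start_task(task_id)
    for _ in range(5):
        tracker.increment_step(task_id)  # one agent step completed
    tracker.finish_task(task_id)         # counted as max_steps for progress purposes

# Renders a tqdm-style line, roughly "100%|████...| 10/10 [0:00<0:00, ... steps/min]"
print(tracker.display())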
hud/utils/tasks.py
DELETED
@@ -1,127 +0,0 @@
-from __future__ import annotations
-
-import json
-from pathlib import Path
-
-from hud.types import Task
-from hud.utils.hud_console import HUDConsole
-
-hud_console = HUDConsole()
-
-
-def load_tasks(tasks_input: str | list[dict], *, raw: bool = False) -> list[Task] | list[dict]:
-    """Load tasks from various sources.
-
-    Args:
-        tasks_input: Either:
-            - Path to a JSON file (array of tasks)
-            - Path to a JSONL file (one task per line)
-            - HuggingFace dataset name (format: "username/dataset" or "username/dataset:split")
-            - List of task dictionaries
-        raw: If True, return raw dicts without validation or env substitution
-
-    Returns:
-        - If raw=False (default): list[Task]
-        - If raw=True: list[dict]
-    """
-    tasks: list[Task] | list[dict] = []
-
-    if isinstance(tasks_input, list):
-        # Direct list of task dicts
-        hud_console.info(f"Loading {len(tasks_input)} tasks from provided list")
-        if raw:
-            return [item for item in tasks_input if isinstance(item, dict)]
-        for item in tasks_input:
-            task = Task(**item)
-            tasks.append(task)
-
-    elif isinstance(tasks_input, str):
-        # Check if it's a file path
-        if Path(tasks_input).exists():
-            file_path = Path(tasks_input)
-
-            with open(file_path, encoding="utf-8") as f:
-                # Handle JSON files (array of tasks)
-                if file_path.suffix.lower() == ".json":
-                    data = json.load(f)
-                    if not isinstance(data, list):
-                        raise ValueError(
-                            f"JSON file must contain an array of tasks, got {type(data)}"
-                        )
-                    if raw:
-                        return [item for item in data if isinstance(item, dict)]
-                    for item in data:
-                        task = Task(**item)
-                        tasks.append(task)
-
-                # Handle JSONL files (one task per line)
-                else:
-                    raw_items: list[dict] = []
-                    for line in f:
-                        line = line.strip()
-                        if not line:
-                            continue
-                        item = json.loads(line)
-                        if isinstance(item, list):
-                            raw_items.extend([it for it in item if isinstance(it, dict)])
-                        elif isinstance(item, dict):
-                            raw_items.append(item)
-                        else:
-                            raise ValueError(
-                                f"Invalid JSONL format: expected dict or list of dicts, got {type(item)}"  # noqa: E501
-                            )
-                    if raw:
-                        return raw_items
-                    for it in raw_items:
-                        task = Task(**it)
-                        tasks.append(task)
-
-        # Check if it's a HuggingFace dataset
-        elif "/" in tasks_input:
-            hud_console.info(f"Loading tasks from HuggingFace dataset: {tasks_input}")
-            try:
-                from datasets import load_dataset
-
-                # Parse dataset name and optional split
-                if ":" in tasks_input:
-                    dataset_name, split = tasks_input.split(":", 1)
-                else:
-                    dataset_name = tasks_input
-                    split = "train"  # Default split
-
-                dataset = load_dataset(dataset_name, split=split)
-
-                # Convert dataset rows to Task objects
-                raw_rows: list[dict] = []
-                for item in dataset:
-                    if not isinstance(item, dict):
-                        raise ValueError(
-                            f"Invalid HuggingFace dataset: expected dict, got {type(item)}"
-                        )
-                    if not item["mcp_config"] or not item["prompt"]:
-                        raise ValueError(
-                            f"Invalid HuggingFace dataset: expected mcp_config and prompt, got {item}"  # noqa: E501
-                        )
-                    raw_rows.append(item)
-                if raw:
-                    return raw_rows
-                for row in raw_rows:
-                    task = Task(**row)
-                    tasks.append(task)
-
-            except ImportError as e:
-                raise ImportError(
-                    "Please install 'datasets' to load from HuggingFace: uv pip install datasets"
-                ) from e
-            except Exception as e:
-                raise ValueError(f"Failed to load HuggingFace dataset '{tasks_input}': {e}") from e
-
-        else:
-            raise ValueError(
-                f"Invalid tasks input: '{tasks_input}' is neither a file path nor a HuggingFace dataset"  # noqa: E501
-            )
-
-    else:
-        raise TypeError(f"tasks_input must be str or list, got {type(tasks_input)}")
-
-    return tasks
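The removed load_tasks accepted a JSON/JSONL path, a HuggingFace dataset name (optionally suffixed with ":split"), or an in-memory list of dicts. A short sketch of the 0.4.45-era calls, following the docstring above; the file name is a placeholder and the dataset is the one cited elsewhere in this package:

from hud.utils.tasks import load_tasks

# JSONL file on disk: one task object per line, validated into Task models
tasks = load_tasks("tasks.jsonl")

# HuggingFace dataset with an explicit split, returned as raw dicts (no validation)
rows = load_tasks("hud-evals/SheetBench-50:train", raw=True)

# Raw rows can still be validated later with Task(**row)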