hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/otel/tests/test_processors.py
DELETED
@@ -1,197 +0,0 @@
"""Tests for OpenTelemetry processors."""

from __future__ import annotations

from unittest.mock import MagicMock, patch

from hud.otel.processors import HudEnrichmentProcessor


class TestHudEnrichmentProcessor:
    """Test HudEnrichmentProcessor."""

    def test_on_start_with_run_id(self):
        """Test on_start with current task run ID."""

        processor = HudEnrichmentProcessor()

        # Mock span
        span = MagicMock()
        span.set_attribute = MagicMock()
        span.is_recording.return_value = True

        # Mock baggage to return run ID
        parent_context = {}
        with patch("hud.otel.processors.baggage.get_baggage") as mock_get_baggage:
            # Return run ID for task_run_id, None for job_id
            mock_get_baggage.side_effect = (
                lambda key, context: "test-run-123" if key == "hud.task_run_id" else None
            )
            processor.on_start(span, parent_context)

        # Verify attribute was set
        span.set_attribute.assert_called_with("hud.task_run_id", "test-run-123")

    def test_on_start_no_run_id(self):
        """Test on_start without current task run ID."""

        processor = HudEnrichmentProcessor()

        # Mock span
        span = MagicMock()
        span.set_attribute = MagicMock()
        span.is_recording.return_value = True
        span.name = "test_span"

        # Set up attributes to return None (not matching any step type)
        span.attributes = {}

        # Mock baggage to return None
        parent_context = {}
        with patch("hud.otel.processors.baggage.get_baggage", return_value=None):
            processor.on_start(span, parent_context)

        # Verify only step count attributes were set (no run_id or job_id)
        calls = span.set_attribute.call_args_list
        set_attrs = {call[0][0] for call in calls}

        # Should have step counts but not run_id/job_id
        assert "hud.task_run_id" not in set_attrs
        assert "hud.job_id" not in set_attrs
        assert "hud.base_mcp_steps" in set_attrs
        assert "hud.mcp_tool_steps" in set_attrs
        assert "hud.agent_steps" in set_attrs

    def test_on_end(self):
        """Test on_end does nothing."""

        processor = HudEnrichmentProcessor()
        span = MagicMock()

        # Should not raise
        processor.on_end(span)

    def test_shutdown(self):
        """Test shutdown does nothing."""

        processor = HudEnrichmentProcessor()

        # Should not raise
        processor.shutdown()

    def test_force_flush(self):
        """Test force_flush returns True."""

        processor = HudEnrichmentProcessor()

        # Should return True
        result = processor.force_flush()
        assert result is True

    def test_on_start_with_job_id(self):
        """Test on_start with job ID in baggage."""

        processor = HudEnrichmentProcessor()

        # Mock span
        span = MagicMock()
        span.set_attribute = MagicMock()
        span.is_recording.return_value = True

        # Mock baggage with job ID
        parent_context = {}
        with patch("hud.otel.processors.baggage.get_baggage") as mock_get_baggage:
            # Return None for task_run_id, job-123 for job_id
            mock_get_baggage.side_effect = (
                lambda key, context: "job-123" if key == "hud.job_id" else None
            )
            processor.on_start(span, parent_context)

        # Verify job ID attribute was set
        span.set_attribute.assert_called_with("hud.job_id", "job-123")

    def test_on_start_exception_handling(self):
        """Test on_start handles exceptions gracefully."""

        processor = HudEnrichmentProcessor()

        # Mock span that raises exception
        span = MagicMock()
        span.is_recording.side_effect = Exception("Test error")

        # Should not raise
        processor.on_start(span, parent_context=None)

    def test_on_start_exception_handling_extended(self):
        """Test that exceptions in on_start are caught and logged."""
        from hud.otel.processors import HudEnrichmentProcessor

        processor = HudEnrichmentProcessor()

        # Create a mock span that raises when setting attributes
        mock_span = MagicMock()
        mock_span.is_recording.return_value = True
        mock_span.set_attribute.side_effect = RuntimeError("Attribute error")

        parent_context = {}

        # Patch logger and baggage to force an exception when setting attribute
        with (
            patch("hud.otel.processors.logger") as mock_logger,
            patch("hud.otel.processors.baggage.get_baggage", return_value="test-id"),
        ):
            # Should not raise, exception should be caught
            processor.on_start(mock_span, parent_context)

            # Verify logger.debug was called with the exception
            mock_logger.debug.assert_called_once()
            args = mock_logger.debug.call_args[0]
            assert "HudEnrichmentProcessor.on_start error" in args[0]
            assert "Attribute error" in str(args[1])

    def test_on_start_with_baggage_get_exception(self):
        """Test exception handling when baggage.get_baggage fails for task_run_id."""
        processor = HudEnrichmentProcessor()

        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        parent_context = {}

        # Make baggage.get_baggage raise an exception for task_run_id
        with (
            patch(
                "hud.otel.processors.baggage.get_baggage",
                side_effect=ValueError("Context error"),
            ),
            patch("hud.otel.processors.logger") as mock_logger,
        ):
            # Should not raise
            processor.on_start(mock_span, parent_context)

            # Verify logger.debug was called
            mock_logger.debug.assert_called_once()
            args = mock_logger.debug.call_args[0]
            assert "Context error" in str(args[1])

    def test_on_start_with_baggage_exception(self):
        """Test exception handling when baggage.get_baggage fails."""
        processor = HudEnrichmentProcessor()

        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        parent_context = {}

        # Make baggage.get_baggage raise an exception
        with (
            patch("hud.otel.processors.baggage.get_baggage", side_effect=KeyError("Baggage error")),
            patch("hud.otel.processors.logger") as mock_logger,
        ):
            # Should not raise
            processor.on_start(mock_span, parent_context)

            # Verify logger.debug was called
            mock_logger.debug.assert_called_once()
            args = mock_logger.debug.call_args[0]
            assert "Baggage error" in str(args[1])
hud/rl/README.md
DELETED
@@ -1,30 +0,0 @@
We suggest running `hud rl` (or with the `--local` flag) for optimal hyperparameters and native HuggingFace support.

However, to run this independently, spin up an instance with at least 2 GPUs and run:
```bash
sudo apt-get update -y && sudo apt-get install -y cuda-toolkit-12-6
uv pip install -e .[rl]
uv pip install ninja
uv pip install flash-attn --no-build-isolation
```

Launch a vLLM server with:
```bash
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
export TOKENIZERS_PARALLELISM=false
export VLLM_LOGGING_LEVEL=INFO
export CUDA_VISIBLE_DEVICES=7  # Set this to your last GPU

uv run vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
  --api-key token-abc123 --host 0.0.0.0 --port 8000 --tensor-parallel-size 1 --trust-remote-code \
  --max-model-len 16384 --enable-lora --max-lora-rank 64 --max-cpu-loras 4 --enable-auto-tool-choice \
  --tool-call-parser hermes --disable-log-requests --dtype auto
```

Then start training with (replace 2 with the number of spare GPUs you have):
```bash
hud get hud-evals/2048-basic
torchrun --nproc-per-node 2 -m hud.rl.train --tasks 2048-basic.json --verbose
```

Add a `--config path/to/config.json` flag to run a specific configuration (or change the defaults in `config.py`).
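The removed README above references a `--config path/to/config.json` flag without showing an example. As a hedged sketch only: the field names below are inferred from how the removed `hud/rl/actor.py` (reproduced further down) reads `config.actor.*`, `config.model.base_model`, and `config.verbose`; the authoritative schema and defaults lived in the removed `hud/rl/config.py`, so the nesting and every value here are assumptions, not the package's documented format.

```python
# Hypothetical example only: field names inferred from hud/rl/actor.py;
# the real schema lived in the removed hud/rl/config.py.
import json

config = {
    "verbose": True,
    "model": {"base_model": "Qwen/Qwen2.5-VL-3B-Instruct"},  # model served by vLLM
    "actor": {
        "vllm_base_url": "http://127.0.0.1:8000/v1",  # where `vllm serve` is listening
        "vllm_api_key": "token-abc123",               # must match --api-key above
        "max_parallel_episodes": 4,
        "max_steps_per_episode": 6,
        "episode_timeout_sec": 120,
        "temperature": 0.7,
        "max_new_tokens": 1024,
        "force_tool_choice": True,
    },
}

with open("config.json", "w") as f:
    json.dump(config, f, indent=2)
# then: torchrun --nproc-per-node 2 -m hud.rl.train --tasks 2048-basic.json --config config.json
```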
hud/rl/__init__.py
DELETED
@@ -1 +0,0 @@
"""RL module for HUD."""
hud/rl/actor.py
DELETED
@@ -1,176 +0,0 @@
"""Actor for episode collection using vLLM and HUD."""

from __future__ import annotations

import asyncio
import logging

import httpx
from openai import AsyncOpenAI

import hud
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
from hud.clients.utils.retry_transport import create_retry_httpx_client
from hud.types import Task, Trace
from hud.utils.hud_console import HUDConsole

from .config import Config

logger = logging.getLogger(__name__)
hud_console = HUDConsole(logger)


class Actor:
    """Collects episodes using vLLM-served models via HUD agents."""

    def __init__(self, config: Config) -> None:
        self.config = config
        self.actor_config = config.actor
        self.current_adapter = config.model.base_model

        # Setup OpenAI client for vLLM
        base_url = self.actor_config.vllm_base_url.replace("localhost", "127.0.0.1")
        self.openai_client = self._create_openai_client(base_url)

    def _create_openai_client(self, base_url: str) -> AsyncOpenAI:
        """Create OpenAI client with optimized settings for vLLM."""
        # Match connection limits to parallel_episodes to avoid bottlenecks
        # Use shorter per-request timeout and keep retries modest to avoid long blocking
        http_client = create_retry_httpx_client(
            timeout=httpx.Timeout(30.0),
        )
        return AsyncOpenAI(
            base_url=base_url,
            api_key=self.actor_config.vllm_api_key,
            http_client=http_client,
            max_retries=2,
        )

    def create_agent(self) -> GenericOpenAIChatAgent:
        """Create an agent with the current adapter."""
        return GenericOpenAIChatAgent(
            openai_client=self.openai_client,
            model_name=self.current_adapter,
            allowed_tools=self.actor_config.allowed_tools,
            append_setup_output=False,
            system_prompt=self.actor_config.system_prompt,
            verbose=self.config.verbose,
            completion_kwargs={
                "temperature": self.actor_config.temperature,
                "max_tokens": self.actor_config.max_new_tokens,
                "tool_choice": "required" if self.actor_config.force_tool_choice else "auto",
            },
        )

    def update_adapter(self, adapter_name: str) -> None:
        """Update the current adapter being used."""
        self.current_adapter = adapter_name
        hud_console.info(f"[Actor] Using adapter: {adapter_name}")

    async def run_tasks(self, tasks: list[Task], job_id: str) -> list[Trace]:
        """Run tasks and collect traces."""
        traces = []

        # Process tasks in batches respecting max_parallel_episodes limit
        for batch_start in range(0, len(tasks), self.actor_config.max_parallel_episodes):
            batch_end = min(batch_start + self.actor_config.max_parallel_episodes, len(tasks))
            batch = tasks[batch_start:batch_end]

            # Run batch in parallel with per-episode timeout protection
            async def run_with_timeout(t: Task) -> Trace:
                try:
                    return await asyncio.wait_for(
                        self._run_task(t, job_id),
                        timeout=self.actor_config.episode_timeout_sec,
                    )
                except TimeoutError:
                    hud_console.warning_log(f"Episode timed out for task {t.id}")
                    # Attach task so buffer grouping has key
                    return Trace(isError=True, content="Episode timeout", task=t)

            results = await asyncio.gather(
                *[run_with_timeout(t) for t in batch],
                return_exceptions=True,
            )

            # Normalize exceptions to error traces and ensure task is attached
            for t, res in zip(batch, results, strict=False):
                if isinstance(res, Exception):
                    hud_console.warning_log(f"Episode error: {res}")
                    traces.append(Trace(isError=True, content=str(res), task=t))
                else:
                    traces.append(res)

        return traces

    async def _run_task(self, task: Task, job_id: str) -> Trace:
        """Run a single task."""
        agent = self.create_agent()

        # Run the task
        try:
            with hud.trace(f"Training | {task.prompt}", job_id=job_id):
                result = await agent.run(task, max_steps=self.actor_config.max_steps_per_episode)

        except Exception:
            logger.info("GOT EXCEPTION")
            # Preserve task on exception for grouping
            return Trace(isError=True, task=task)

        result.info["tool_spec"] = agent.get_tool_schemas()

        return result


if __name__ == "__main__":
    from hud.types import Task

    async def test_actor() -> None:
        """Test the actor with a single 2048 task using local hud-browser image."""
        config = Config()
        config.actor.max_parallel_episodes = 1
        config.actor.max_steps_per_episode = 6
        config.actor.episode_timeout_sec = 120
        config.verbose = True

        # Create test task with local hud-browser image
        task_data = {
            "id": "test_2048_128",
            "prompt": "Play the browser-based 2048 game and try to reach the 128 tile. Start by taking a screenshot, then make strategic moves using arrow keys.",  # noqa: E501
            "mcp_config": {
                "local": {
                    "command": "sh",
                    "args": [
                        "-c",
                        "docker run --rm --platform linux/amd64 -i hud-browser:latest 2>/dev/null",
                    ],
                }
            },
            "setup_tool": {"name": "launch_app", "arguments": {"app_name": "2048"}},
            "evaluate_tool": {
                "name": "evaluate",
                "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}},
            },
            "system_prompt": "You are an expert 2048 game player. Use arrow keys to reach the target tile. First take a screenshot, then make strategic moves.",  # noqa: E501
        }

        task = Task(**task_data)
        actor = Actor(config)

        logger.info("Testing actor with task: %s", task.id)
        logger.info("Model: %s", config.model.base_model)
        logger.info("VLLM: %s", config.actor.vllm_base_url)

        traces = await actor.run_tasks([task], job_id="test_2048")

        for trace in traces:
            if trace.isError:
                logger.info("Error: %s", trace.content)
            else:
                logger.info("Success!")
                logger.info("Trace info: %s", trace.info if hasattr(trace, "info") else "No info")
                # Check for evaluation in the trace info
                if hasattr(trace, "info") and "evaluation" in trace.info:
                    logger.info("  Evaluation: %s", trace.info["evaluation"])

    asyncio.run(test_actor())