PyPI - hud-python - Versions diffs - 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (274)

hud/__init__.py +27 -7
hud/agents/__init__.py +11 -5
hud/agents/base.py +220 -500
hud/agents/claude.py +200 -240
hud/agents/gemini.py +275 -0
hud/agents/gemini_cua.py +335 -0
hud/agents/grounded_openai.py +98 -100
hud/agents/misc/integration_test_agent.py +51 -20
hud/agents/misc/response_agent.py +41 -36
hud/agents/openai.py +291 -292
hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
hud/agents/operator.py +211 -0
hud/agents/tests/conftest.py +133 -0
hud/agents/tests/test_base.py +300 -622
hud/agents/tests/test_base_runtime.py +233 -0
hud/agents/tests/test_claude.py +379 -210
hud/agents/tests/test_client.py +9 -10
hud/agents/tests/test_gemini.py +369 -0
hud/agents/tests/test_grounded_openai_agent.py +65 -50
hud/agents/tests/test_openai.py +376 -140
hud/agents/tests/test_operator.py +362 -0
hud/agents/tests/test_run_eval.py +179 -0
hud/cli/__init__.py +461 -545
hud/cli/analyze.py +43 -5
hud/cli/build.py +664 -110
hud/cli/debug.py +8 -5
hud/cli/dev.py +882 -734
hud/cli/eval.py +782 -668
hud/cli/flows/dev.py +167 -0
hud/cli/flows/init.py +191 -0
hud/cli/flows/tasks.py +153 -56
hud/cli/flows/templates.py +151 -0
hud/cli/flows/tests/__init__.py +1 -0
hud/cli/flows/tests/test_dev.py +126 -0
hud/cli/init.py +60 -58
hud/cli/push.py +29 -11
hud/cli/rft.py +311 -0
hud/cli/rft_status.py +145 -0
hud/cli/tests/test_analyze.py +5 -5
hud/cli/tests/test_analyze_metadata.py +3 -2
hud/cli/tests/test_analyze_module.py +120 -0
hud/cli/tests/test_build.py +108 -6
hud/cli/tests/test_build_failure.py +41 -0
hud/cli/tests/test_build_module.py +50 -0
hud/cli/tests/test_cli_init.py +6 -1
hud/cli/tests/test_cli_more_wrappers.py +30 -0
hud/cli/tests/test_cli_root.py +140 -0
hud/cli/tests/test_convert.py +361 -0
hud/cli/tests/test_debug.py +12 -10
hud/cli/tests/test_dev.py +197 -0
hud/cli/tests/test_eval.py +251 -0
hud/cli/tests/test_eval_bedrock.py +51 -0
hud/cli/tests/test_init.py +124 -0
hud/cli/tests/test_main_module.py +11 -5
hud/cli/tests/test_mcp_server.py +12 -100
hud/cli/tests/test_push_happy.py +74 -0
hud/cli/tests/test_push_wrapper.py +23 -0
hud/cli/tests/test_registry.py +1 -1
hud/cli/tests/test_utils.py +1 -1
hud/cli/{rl → utils}/celebrate.py +14 -12
hud/cli/utils/config.py +18 -1
hud/cli/utils/docker.py +130 -4
hud/cli/utils/env_check.py +9 -9
hud/cli/utils/git.py +136 -0
hud/cli/utils/interactive.py +39 -5
hud/cli/utils/metadata.py +69 -0
hud/cli/utils/runner.py +1 -1
hud/cli/utils/server.py +2 -2
hud/cli/utils/source_hash.py +3 -3
hud/cli/utils/tasks.py +4 -1
hud/cli/utils/tests/__init__.py +0 -0
hud/cli/utils/tests/test_config.py +58 -0
hud/cli/utils/tests/test_docker.py +93 -0
hud/cli/utils/tests/test_docker_hints.py +71 -0
hud/cli/utils/tests/test_env_check.py +74 -0
hud/cli/utils/tests/test_environment.py +42 -0
hud/cli/utils/tests/test_git.py +142 -0
hud/cli/utils/tests/test_interactive_module.py +60 -0
hud/cli/utils/tests/test_local_runner.py +50 -0
hud/cli/utils/tests/test_logging_utils.py +23 -0
hud/cli/utils/tests/test_metadata.py +49 -0
hud/cli/utils/tests/test_package_runner.py +35 -0
hud/cli/utils/tests/test_registry_utils.py +49 -0
hud/cli/utils/tests/test_remote_runner.py +25 -0
hud/cli/utils/tests/test_runner_modules.py +52 -0
hud/cli/utils/tests/test_source_hash.py +36 -0
hud/cli/utils/tests/test_tasks.py +80 -0
hud/cli/utils/version_check.py +258 -0
hud/cli/{rl → utils}/viewer.py +2 -2
hud/clients/README.md +12 -11
hud/clients/__init__.py +4 -3
hud/clients/base.py +166 -26
hud/clients/environment.py +51 -0
hud/clients/fastmcp.py +13 -6
hud/clients/mcp_use.py +40 -15
hud/clients/tests/test_analyze_scenarios.py +206 -0
hud/clients/tests/test_protocol.py +9 -3
hud/datasets/__init__.py +23 -20
hud/datasets/loader.py +327 -0
hud/datasets/runner.py +192 -105
hud/datasets/tests/__init__.py +0 -0
hud/datasets/tests/test_loader.py +221 -0
hud/datasets/tests/test_utils.py +315 -0
hud/datasets/utils.py +270 -90
hud/environment/__init__.py +50 -0
hud/environment/connection.py +206 -0
hud/environment/connectors/__init__.py +33 -0
hud/environment/connectors/base.py +68 -0
hud/environment/connectors/local.py +177 -0
hud/environment/connectors/mcp_config.py +109 -0
hud/environment/connectors/openai.py +101 -0
hud/environment/connectors/remote.py +172 -0
hud/environment/environment.py +694 -0
hud/environment/integrations/__init__.py +45 -0
hud/environment/integrations/adk.py +67 -0
hud/environment/integrations/anthropic.py +196 -0
hud/environment/integrations/gemini.py +92 -0
hud/environment/integrations/langchain.py +82 -0
hud/environment/integrations/llamaindex.py +68 -0
hud/environment/integrations/openai.py +238 -0
hud/environment/mock.py +306 -0
hud/environment/router.py +112 -0
hud/environment/scenarios.py +493 -0
hud/environment/tests/__init__.py +1 -0
hud/environment/tests/test_connection.py +317 -0
hud/environment/tests/test_connectors.py +218 -0
hud/environment/tests/test_environment.py +161 -0
hud/environment/tests/test_integrations.py +257 -0
hud/environment/tests/test_local_connectors.py +201 -0
hud/environment/tests/test_scenarios.py +280 -0
hud/environment/tests/test_tools.py +208 -0
hud/environment/types.py +23 -0
hud/environment/utils/__init__.py +35 -0
hud/environment/utils/formats.py +215 -0
hud/environment/utils/schema.py +171 -0
hud/environment/utils/tool_wrappers.py +113 -0
hud/eval/__init__.py +67 -0
hud/eval/context.py +674 -0
hud/eval/display.py +299 -0
hud/eval/instrument.py +185 -0
hud/eval/manager.py +466 -0
hud/eval/parallel.py +268 -0
hud/eval/task.py +340 -0
hud/eval/tests/__init__.py +1 -0
hud/eval/tests/test_context.py +178 -0
hud/eval/tests/test_eval.py +210 -0
hud/eval/tests/test_manager.py +152 -0
hud/eval/tests/test_parallel.py +168 -0
hud/eval/tests/test_task.py +145 -0
hud/eval/types.py +63 -0
hud/eval/utils.py +183 -0
hud/patches/__init__.py +19 -0
hud/patches/mcp_patches.py +151 -0
hud/patches/warnings.py +54 -0
hud/samples/browser.py +4 -4
hud/server/__init__.py +2 -1
hud/server/low_level.py +2 -1
hud/server/router.py +164 -0
hud/server/server.py +567 -80
hud/server/tests/test_mcp_server_integration.py +11 -11
hud/server/tests/test_mcp_server_more.py +1 -1
hud/server/tests/test_server_extra.py +2 -0
hud/settings.py +45 -3
hud/shared/exceptions.py +36 -10
hud/shared/hints.py +26 -1
hud/shared/requests.py +15 -3
hud/shared/tests/test_exceptions.py +40 -31
hud/shared/tests/test_hints.py +167 -0
hud/telemetry/__init__.py +20 -19
hud/telemetry/exporter.py +201 -0
hud/telemetry/instrument.py +158 -253
hud/telemetry/tests/test_eval_telemetry.py +356 -0
hud/telemetry/tests/test_exporter.py +258 -0
hud/telemetry/tests/test_instrument.py +401 -0
hud/tools/__init__.py +16 -2
hud/tools/apply_patch.py +639 -0
hud/tools/base.py +54 -4
hud/tools/bash.py +2 -2
hud/tools/computer/__init__.py +4 -0
hud/tools/computer/anthropic.py +2 -2
hud/tools/computer/gemini.py +385 -0
hud/tools/computer/hud.py +23 -6
hud/tools/computer/openai.py +20 -21
hud/tools/computer/qwen.py +434 -0
hud/tools/computer/settings.py +37 -0
hud/tools/edit.py +3 -7
hud/tools/executors/base.py +4 -2
hud/tools/executors/pyautogui.py +1 -1
hud/tools/grounding/grounded_tool.py +13 -18
hud/tools/grounding/grounder.py +10 -31
hud/tools/grounding/tests/test_grounded_tool.py +26 -44
hud/tools/jupyter.py +330 -0
hud/tools/playwright.py +18 -3
hud/tools/shell.py +308 -0
hud/tools/tests/test_apply_patch.py +718 -0
hud/tools/tests/test_computer.py +4 -9
hud/tools/tests/test_computer_actions.py +24 -2
hud/tools/tests/test_jupyter_tool.py +181 -0
hud/tools/tests/test_shell.py +596 -0
hud/tools/tests/test_submit.py +85 -0
hud/tools/tests/test_types.py +193 -0
hud/tools/types.py +21 -1
hud/types.py +167 -57
hud/utils/__init__.py +2 -0
hud/utils/env.py +67 -0
hud/utils/hud_console.py +61 -3
hud/utils/mcp.py +15 -58
hud/utils/strict_schema.py +162 -0
hud/utils/tests/test_init.py +1 -2
hud/utils/tests/test_mcp.py +1 -28
hud/utils/tests/test_pretty_errors.py +186 -0
hud/utils/tests/test_tool_shorthand.py +154 -0
hud/utils/tests/test_version.py +1 -1
hud/utils/types.py +20 -0
hud/version.py +1 -1
hud_python-0.5.1.dist-info/METADATA +264 -0
hud_python-0.5.1.dist-info/RECORD +299 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
hud/agents/langchain.py +0 -261
hud/agents/lite_llm.py +0 -72
hud/cli/rl/__init__.py +0 -180
hud/cli/rl/config.py +0 -101
hud/cli/rl/display.py +0 -133
hud/cli/rl/gpu.py +0 -63
hud/cli/rl/gpu_utils.py +0 -321
hud/cli/rl/local_runner.py +0 -595
hud/cli/rl/presets.py +0 -96
hud/cli/rl/remote_runner.py +0 -463
hud/cli/rl/rl_api.py +0 -150
hud/cli/rl/vllm.py +0 -177
hud/cli/rl/wait_utils.py +0 -89
hud/datasets/parallel.py +0 -687
hud/misc/__init__.py +0 -1
hud/misc/claude_plays_pokemon.py +0 -292
hud/otel/__init__.py +0 -35
hud/otel/collector.py +0 -142
hud/otel/config.py +0 -181
hud/otel/context.py +0 -570
hud/otel/exporters.py +0 -369
hud/otel/instrumentation.py +0 -135
hud/otel/processors.py +0 -121
hud/otel/tests/__init__.py +0 -1
hud/otel/tests/test_processors.py +0 -197
hud/rl/README.md +0 -30
hud/rl/__init__.py +0 -1
hud/rl/actor.py +0 -176
hud/rl/buffer.py +0 -405
hud/rl/chat_template.jinja +0 -101
hud/rl/config.py +0 -192
hud/rl/distributed.py +0 -132
hud/rl/learner.py +0 -637
hud/rl/tests/__init__.py +0 -1
hud/rl/tests/test_learner.py +0 -186
hud/rl/train.py +0 -382
hud/rl/types.py +0 -101
hud/rl/utils/start_vllm_server.sh +0 -30
hud/rl/utils.py +0 -524
hud/rl/vllm_adapter.py +0 -143
hud/telemetry/job.py +0 -352
hud/telemetry/replay.py +0 -74
hud/telemetry/tests/test_replay.py +0 -40
hud/telemetry/tests/test_trace.py +0 -63
hud/telemetry/trace.py +0 -158
hud/utils/agent_factories.py +0 -86
hud/utils/async_utils.py +0 -65
hud/utils/group_eval.py +0 -223
hud/utils/progress.py +0 -149
hud/utils/tasks.py +0 -127
hud/utils/tests/test_async_utils.py +0 -173
hud/utils/tests/test_progress.py +0 -261
hud_python-0.4.45.dist-info/METADATA +0 -552
hud_python-0.4.45.dist-info/RECORD +0 -228
{hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0

hud/datasets/utils.py CHANGED Viewed

@@ -1,118 +1,298 @@
-"""Dataset utilities for loading, saving, and fetching datasets."""
+"""Utility functions and schemas for the datasets module."""
 from __future__ import annotations
-import json
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
-from datasets import Dataset
+import httpx
+from pydantic import BaseModel, Field, field_validator, model_validator
-from hud.types import Task
+from hud.settings import settings
+from hud.types import AgentType, TaskInput
+from hud.utils.hud_console import HUDConsole
-logger = logging.getLogger("hud.datasets")
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+logger = logging.getLogger(__name__)
+hud_console = HUDConsole()
-async def fetch_system_prompt_from_dataset(dataset_id: str) -> str | None:
-    """
-    Fetch system_prompt.txt from a HuggingFace dataset repository.
+__all__ = [
+    "BatchRequest",
+    "SingleTaskRequest",
+    "cancel_all_jobs",
+    "cancel_job",
+    "cancel_task",
+    "submit_rollouts",
+]
-    Args:
-        dataset_id: HuggingFace dataset identifier (e.g., "hud-evals/SheetBench-50")
-    Returns:
-        System prompt text if found, None otherwise
+class SingleTaskRequest(BaseModel):
+    """Request to run a single task remotely - mirrors run_single_task() args."""
+    task: dict[str, Any] = Field(
+        description="Task definition (v4 LegacyTask or v5 Task format).",
+    )
+    agent_type: AgentType = Field(description="Agent type to execute the task.")
+    agent_params: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Agent constructor parameters passed to agent.create(). "
+        "Should include fields from BaseCreateParams (auto_trace, auto_respond, verbose) "
+        "plus agent-specific config fields (e.g., checkpoint_name for ClaudeConfig).",
+    )
+    max_steps: int = Field(default=10, description="Maximum steps allowed for the agent.")
+    job_id: str = Field(description="HUD job identifier for telemetry association.")
+    task_id: str | None = Field(default=None, description="Task identifier.")
+    trace_name: str | None = Field(default=None, description="Trace name.")
+    group_id: str | None = Field(default=None, description="Optional HUD group identifier.")
+    metadata: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Additional metadata to inject into the trace context.",
+    )
+    trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.")
+    use_byok: bool = Field(
+        default=False,
+        description="If True, use BYOK headers from encrypted env vars for inference.",
+    )
+    @model_validator(mode="after")
+    def _validate_task(self) -> SingleTaskRequest:
+        """Validate task is either v4 LegacyTask or v5 Task format."""
+        from hud.eval.utils import is_v4_format, validate_v4_task
+        # v4 format: looks like v4 (prompt + mcp_config)?
+        if is_v4_format(self.task):
+            # Validate completeness (requires evaluate_tool too)
+            validate_v4_task(self.task)
+            return self
+        # v5 format: env required
+        if "env" in self.task:
+            return self
+        # Neither v4 nor v5
+        raise ValueError("Task must have 'env' (v5) or 'prompt'+'mcp_config'+'evaluate_tool' (v4)")
+    @field_validator("job_id")
+    @classmethod
+    def _validate_job_id(cls, value: str) -> str:
+        if not value or not value.strip():
+            raise ValueError("job_id must be a non-empty string.")
+        return value
+class BatchRequest(BaseModel):
+    """Request to run multiple tasks remotely."""
+    requests: list[SingleTaskRequest] = Field(
+        description="List of single task requests to submit.",
+        min_length=1,
+        max_length=1000,
+    )
+def _normalize_tasks(tasks: Sequence[TaskInput]) -> list[dict[str, Any]]:
+    """Convert tasks to list of dicts for remote API submission."""
+    result = []
+    for t in tasks:
+        if isinstance(t, dict):
+            result.append(t)
+        elif hasattr(t, "model_dump"):
+            result.append(t.model_dump(mode="json"))
+        else:
+            raise TypeError(f"Cannot convert {type(t).__name__} to dict")
+    return result
+async def submit_rollouts(
+    tasks: Sequence[TaskInput],
+    job_id: str,
+    agent_type: AgentType,
+    agent_params: dict[str, Any] | None = None,
+    max_steps: int = 10,
+    group_size: int = 1,
+    batch_size: int = 50,
+    metadata: dict[str, Any] | None = None,
+    use_byok: bool = False,
+) -> None:
+    """Submit rollouts to the HUD platform API for remote execution (fire-and-forget).
+    Args:
+        tasks: List of tasks (v5 Task, v4 LegacyTask, or dicts)
+        job_id: HUD job ID for telemetry grouping
+        agent_type: Agent type to use for execution
+        agent_params: Parameters passed to agent.create()
+        max_steps: Maximum steps per rollout
+        group_size: Number of rollouts per task (for variance estimation)
+        batch_size: Number of rollouts per API batch request
+        metadata: Additional metadata for each rollout
+        use_byok: If True, use BYOK keys from encrypted env vars (remote only)
     """
-    try:
-        # Import here to avoid unnecessary dependency
-        from huggingface_hub import hf_hub_download
-        from huggingface_hub.errors import EntryNotFoundError
-        # Try to download the system_prompt.txt file
-        try:
-            file_path = hf_hub_download(
-                repo_id=dataset_id, filename="system_prompt.txt", repo_type="dataset"
+    from hud.eval.utils import is_v4_format
+    if not settings.api_key:
+        raise ValueError("HUD_API_KEY is required for remote execution")
+    # Convert to dicts once for uniform processing
+    task_dicts = _normalize_tasks(tasks)
+    # Validate v4 tasks have remote-compatible mcp_config (URL-based, not command-based)
+    for i, td in enumerate(task_dicts):
+        if not is_v4_format(td):
+            continue  # v5 tasks use env config, no mcp_config to check
+        mcp_config = td.get("mcp_config") or {}
+        for server_name, server_cfg in mcp_config.items():
+            is_local = (
+                isinstance(server_cfg, dict)
+                and "command" in server_cfg
+                and not server_cfg.get("url")
+            )
+            if is_local:
+                raise ValueError(
+                    f"Remote execution requires URL-based mcp_config. "
+                    f"Task {td.get('id') or i} uses local Docker config for '{server_name}'. "
+                    "Convert to remote with: hud convert <tasks_file>"
+                )
+    # Build single task requests
+    requests: list[SingleTaskRequest] = []
+    for task_idx, td in enumerate(task_dicts):
+        base_task_id = td.get("id") or f"task_{task_idx}"
+        trace_name = td.get("prompt") or td.get("scenario") or base_task_id
+        for rollout_idx in range(group_size):
+            task_id = f"{base_task_id}_r{rollout_idx}" if group_size > 1 else base_task_id
+            requests.append(
+                SingleTaskRequest(
+                    task=td,
+                    agent_type=agent_type,
+                    agent_params=agent_params or {},
+                    max_steps=max_steps,
+                    job_id=job_id,
+                    task_id=task_id,
+                    trace_name=trace_name,
+                    group_id=base_task_id if group_size > 1 else None,
+                    metadata=metadata or {},
+                    use_byok=use_byok,
+                )
             )
-            # Read and return the content
-            with open(file_path, encoding="utf-8") as f:  # noqa: ASYNC230
-                content = f.read().strip()
-                if content:
-                    logger.info(
-                        "Loaded system prompt from %s (length: %d chars)", dataset_id, len(content)
-                    )
-                    return content
-                else:
-                    logger.warning("System prompt file is empty in %s", dataset_id)
-                    return None
-        except EntryNotFoundError:
-            logger.debug("No system_prompt.txt found in dataset %s", dataset_id)
-            return None
-    except ImportError:
-        logger.warning(
-            "huggingface_hub not installed. Install it to fetch system prompts from datasets."
+    # Submit in batches
+    api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/run_list"
+    headers = {"Authorization": f"Bearer {settings.api_key}"}
+    total_accepted = 0
+    total_rejected = 0
+    async with httpx.AsyncClient(timeout=120) as client:
+        for i in range(0, len(requests), batch_size):
+            batch = requests[i : i + batch_size]
+            batch_request = BatchRequest(requests=batch)
+            try:
+                response = await client.post(
+                    api_url,
+                    json=batch_request.model_dump(mode="json"),
+                    headers=headers,
+                )
+                response.raise_for_status()
+                result = response.json()
+                total_accepted += result.get("accepted", 0)
+                total_rejected += result.get("rejected", 0)
+                for item in result.get("results", []):
+                    if isinstance(item, dict) and item.get("status") == "rejected":
+                        hud_console.warning(f"Task rejected: {item.get('error', 'Unknown reason')}")
+                batch_num = (i // batch_size) + 1
+                total_batches = (len(requests) + batch_size - 1) // batch_size
+                hud_console.info(
+                    f"Batch {batch_num}/{total_batches}: "
+                    f"{result.get('accepted', 0)}/{len(batch)} accepted"
+                )
+            except httpx.HTTPStatusError as exc:
+                if 400 <= exc.response.status_code < 500:
+                    raise ValueError(f"Submission failed: {exc.response.text}") from exc
+                hud_console.error(f"Batch submission failed: {exc.response.status_code}")
+                total_rejected += len(batch)
+            except Exception as exc:
+                hud_console.error(f"Batch submission failed: {exc}")
+                total_rejected += len(batch)
+    # Log final summary
+    if total_rejected > 0:
+        hud_console.warning(
+            f"Submitted {total_accepted}/{len(requests)} requests ({total_rejected} rejected)"
         )
-        return None
-    except Exception as e:
-        logger.error("Error fetching system prompt from %s: %s", dataset_id, e)
-        return None
+    else:
+        hud_console.info(f"Submitted {total_accepted}/{len(requests)} requests")
-def save_tasks(
-    tasks: list[dict[str, Any]], repo_id: str, fields: list[str] | None = None, **kwargs: Any
-) -> None:
+async def cancel_job(job_id: str) -> dict[str, Any]:
+    """Cancel all tasks for a specific job.
+    Args:
+        job_id: The job ID to cancel
+    Returns:
+        Response with cancellation results including total_found, cancelled counts
     """
-    Save data to HuggingFace dataset with JSON string serialization.
+    api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/cancel_job"
+    headers = {"Authorization": f"Bearer {settings.api_key}"}
+    async with httpx.AsyncClient(timeout=30) as client:
+        response = await client.post(
+            api_url,
+            json={"job_id": job_id},
+            headers=headers,
+        )
+        response.raise_for_status()
+        return response.json()
-    Complex fields (dicts, lists) are serialized as JSON strings to maintain clean schema
-    and avoid null value pollution in HuggingFace datasets.
+async def cancel_task(job_id: str, task_id: str) -> dict[str, Any]:
+    """Cancel a specific task within a job.
     Args:
-        tasks: List of dictionaries to save
-        repo_id: HuggingFace repository ID (e.g., "hud-evals/my-tasks")
-        fields: Optional list of fields to save. If None, saves all fields from each dict.
-        **kwargs: Additional arguments passed to dataset.push_to_hub()
+        job_id: The job ID
+        task_id: The specific task ID to cancel
+    Returns:
+        Response with cancellation result
     """
-    # Safety check: Ensure we're not saving Task objects (which have resolved env vars)
-    if tasks and isinstance(tasks[0], Task):
-        raise ValueError(
-            "save_tasks expects dictionaries, not Task objects. "
-            "Task objects have resolved environment variables which would expose secrets. "
-            "Please pass raw dictionaries with template strings like '${HUD_API_KEY}' preserved."
-        )
+    api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/cancel"
+    headers = {"Authorization": f"Bearer {settings.api_key}"}
-    # Convert to rows with JSON string fields
-    data = []
-    for i, tc_dict in enumerate(tasks):
-        # Additional safety check for each item
-        if isinstance(tc_dict, Task):
-            raise ValueError(
-                f"Item {i} is a Task object, not a dictionary. "
-                "This would expose resolved environment variables. "
-                "Please convert to dictionary format with template strings preserved."
-            )
+    async with httpx.AsyncClient(timeout=30) as client:
+        response = await client.post(
+            api_url,
+            json={"job_id": job_id, "task_id": task_id},
+            headers=headers,
+        )
+        response.raise_for_status()
+        return response.json()
-        row = {}
-        # Determine which fields to process
-        fields_to_process = fields if fields is not None else list(tc_dict.keys())
+async def cancel_all_jobs() -> dict[str, Any]:
+    """Cancel ALL active jobs for the authenticated user.
-        for field in fields_to_process:
-            if field in tc_dict:
-                value = tc_dict[field]
-                # Serialize complex types as JSON strings
-                if isinstance(value, (dict | list)):
-                    row[field] = json.dumps(value)
-                elif isinstance(value, (str | int | float | bool | type(None))):
-                    row[field] = value if value is not None else ""
-                else:
-                    # For other types, convert to string
-                    row[field] = str(value)
+    This is a "panic button" to stop all running rollouts.
-        data.append(row)
+    Returns:
+        Response with jobs_cancelled, total_tasks_cancelled, and job_details
+    """
+    api_url = f"{settings.hud_api_url.rstrip('/')}/v1/rollouts/cancel_user_jobs"
+    headers = {"Authorization": f"Bearer {settings.api_key}"}
-    # Create and push dataset
-    dataset = Dataset.from_list(data)
-    dataset.push_to_hub(repo_id, **kwargs)
+    async with httpx.AsyncClient(timeout=60) as client:
+        response = await client.post(
+            api_url,
+            json={},
+            headers=headers,
+        )
+        response.raise_for_status()
+        return response.json()

hud/environment/__init__.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""
+HUD Environment - A unified abstraction for MCP environments.
+The Environment class is a server that you can also use as a client.
+It subclasses MCPServer to get server capabilities (@env.tool, serve())
+and composes FastMCP Client instances for remote connections.
+Usage:
+    from hud.environment import Environment
+    # Create and connect
+    env = Environment("my-env").connect_hub("browser", prefix="web")
+    async with env:
+        # Get tools in any format
+        openai_tools = env.as_openai_chat_tools()
+        claude_tools = env.as_claude_tools()
+        # Call tools with any format - auto-parses and returns matching format
+        result = await env.call_tool("web_navigate", url="https://google.com")
+        # Framework integrations (requires external deps)
+        agent_tools = env.as_openai_agent_tools()   # needs openai-agents
+        lc_tools = env.as_langchain_tools()         # needs langchain-core
+"""
+from hud.environment.connection import ConnectionConfig, ConnectionType, Connector
+from hud.environment.environment import Environment
+from hud.environment.mock import MockMixin, generate_mock_value
+from hud.environment.router import ConflictResolution, ToolRouter
+from hud.environment.scenarios import ScenarioMixin
+from hud.environment.types import EnvConfig
+from hud.environment.utils import ToolFormat, format_result, parse_tool_call, parse_tool_calls
+__all__ = [
+    "ConflictResolution",
+    "ConnectionConfig",
+    "ConnectionType",
+    "Connector",
+    "EnvConfig",
+    "Environment",
+    "MockMixin",
+    "ScenarioMixin",
+    "ToolFormat",
+    "ToolRouter",
+    "format_result",
+    "generate_mock_value",
+    "parse_tool_call",
+    "parse_tool_calls",
+]

hud/environment/connection.py ADDED Viewed

@@ -0,0 +1,206 @@
+"""Connection management for MCP servers."""
+from __future__ import annotations
+import logging
+from enum import Enum
+from typing import TYPE_CHECKING, Any
+import mcp.types as mcp_types
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from fastmcp.client import Client as FastMCPClient
+    from fastmcp.tools.tool import Tool
+__all__ = ["ConnectionConfig", "ConnectionType", "Connector"]
+logger = logging.getLogger(__name__)
+class ConnectionType(str, Enum):
+    """Type of connection - determines parallelization capability."""
+    LOCAL = "local"  # Stdio/Docker - single instance, not parallelizable
+    REMOTE = "remote"  # HTTP/URL - can spawn multiple instances
+class ConnectionConfig:
+    """Configuration for filtering/transforming tools from a remote connection."""
+    def __init__(
+        self,
+        *,
+        prefix: str | None = None,
+        include: list[str] | None = None,
+        exclude: list[str] | None = None,
+        transform: Callable[[Tool], Tool | None] | None = None,
+    ) -> None:
+        self.prefix = prefix
+        self.include = include
+        self.exclude = exclude
+        self.transform = transform
+class Connector:
+    """Manages a connection to an MCP server with tool caching.
+    Client creation is deferred to connect() so that:
+    1. Each parallel trace gets fresh client instances
+    2. Connection happens inside trace context (for header injection)
+    """
+    def __init__(
+        self,
+        transport: Any,
+        config: ConnectionConfig,
+        name: str,
+        connection_type: ConnectionType,
+        *,
+        auth: str | None = None,
+    ) -> None:
+        # Store transport config - client created in connect()
+        self._transport = transport
+        self._auth = auth
+        self.config = config
+        self.name = name
+        self.connection_type = connection_type
+        self.client: FastMCPClient[Any] | None = None
+        self._tools_cache: list[mcp_types.Tool] | None = None
+    def copy(self) -> Connector:
+        """Create a copy of this connector with fresh (unconnected) state.
+        The copy shares transport config but has its own client instance,
+        allowing parallel execution without conflicts.
+        """
+        return Connector(
+            transport=self._transport,
+            config=self.config,
+            name=self.name,
+            connection_type=self.connection_type,
+            auth=self._auth,
+        )
+    @property
+    def is_local(self) -> bool:
+        """True if this is a local (non-parallelizable) connection."""
+        return self.connection_type == ConnectionType.LOCAL
+    @property
+    def is_remote(self) -> bool:
+        """True if this is a remote (parallelizable) connection."""
+        return self.connection_type == ConnectionType.REMOTE
+    @property
+    def is_connected(self) -> bool:
+        return self.client is not None and self.client.is_connected()
+    @property
+    def cached_tools(self) -> list[mcp_types.Tool]:
+        return self._tools_cache or []
+    async def connect(self) -> None:
+        """Create FastMCP client and connect.
+        Client is created here (not in __init__) so that:
+        1. Each parallel trace gets fresh client instances
+        2. httpx auto-instrumentation can inject trace headers
+        """
+        from fastmcp.client import Client as FastMCPClient
+        # Create fresh client from stored transport config
+        self.client = FastMCPClient(transport=self._transport, auth=self._auth)
+        await self.client.__aenter__()
+    async def disconnect(self) -> None:
+        """Disconnect and clear cache."""
+        if self.client is not None and self.is_connected:
+            await self.client.__aexit__(None, None, None)
+        self.client = None
+        self._tools_cache = None
+    async def list_tools(self) -> list[mcp_types.Tool]:
+        """Fetch tools from server, apply filters/transforms/prefix, and cache."""
+        if self.client is None:
+            raise RuntimeError("Not connected - call connect() first")
+        tools = await self.client.list_tools()
+        result: list[mcp_types.Tool] = []
+        for tool in tools:
+            # Apply include/exclude filter
+            if self.config.include is not None and tool.name not in self.config.include:
+                continue
+            if self.config.exclude is not None and tool.name in self.config.exclude:
+                continue
+            # Apply transform
+            if self.config.transform is not None:
+                from fastmcp.tools.tool import Tool as FastMCPTool
+                fastmcp_tool = FastMCPTool.model_construct(
+                    name=tool.name,
+                    description=tool.description or "",
+                    parameters=tool.inputSchema,
+                )
+                transformed = self.config.transform(fastmcp_tool)
+                if transformed is None:
+                    continue
+                tool = mcp_types.Tool(
+                    name=transformed.name,
+                    description=transformed.description,
+                    inputSchema=transformed.parameters,
+                )
+            # Apply prefix
+            name = f"{self.config.prefix}_{tool.name}" if self.config.prefix else tool.name
+            result.append(
+                mcp_types.Tool(
+                    name=name,
+                    description=tool.description,
+                    inputSchema=tool.inputSchema,
+                )
+            )
+        self._tools_cache = result
+        return result
+    async def call_tool(
+        self, name: str, arguments: dict[str, Any] | None = None
+    ) -> mcp_types.CallToolResult:
+        """Call a tool, stripping prefix if needed."""
+        if self.client is None:
+            raise RuntimeError("Not connected - call connect() first")
+        # Strip prefix when calling remote
+        if self.config.prefix and name.startswith(f"{self.config.prefix}_"):
+            name = name[len(self.config.prefix) + 1 :]
+        return await self.client.call_tool_mcp(name, arguments or {})
+    async def list_resources(self) -> list[mcp_types.Resource]:
+        if self.client is None:
+            raise RuntimeError("Not connected - call connect() first")
+        return await self.client.list_resources()
+    async def list_prompts(self) -> list[mcp_types.Prompt]:
+        if self.client is None:
+            raise RuntimeError("Not connected - call connect() first")
+        return await self.client.list_prompts()
+    async def read_resource(
+        self, uri: str
+    ) -> list[mcp_types.TextResourceContents | mcp_types.BlobResourceContents]:
+        if self.client is None:
+            raise RuntimeError("Not connected - call connect() first")
+        return await self.client.read_resource(uri)
+    async def get_prompt(
+        self, name: str, arguments: dict[str, Any] | None = None
+    ) -> mcp_types.GetPromptResult:
+        if self.client is None:
+            raise RuntimeError("Not connected - call connect() first")
+        return await self.client.get_prompt(name, arguments)
+    def __repr__(self) -> str:
+        t = self.connection_type.value
+        return f"Connector({self.name!r}, {t}, connected={self.is_connected})"

hud/environment/connectors/__init__.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""Connection connectors - methods for connecting to various sources."""
+from hud.environment.connectors.local import LocalConnectorMixin
+from hud.environment.connectors.openai import OpenAIConnectorMixin
+from hud.environment.connectors.remote import RemoteConnectorMixin
+__all__ = ["ConnectorsMixin"]
+class ConnectorsMixin(
+    RemoteConnectorMixin,
+    LocalConnectorMixin,
+    OpenAIConnectorMixin,
+):
+    """Combined connector mixin providing all connection methods.
+    Remote connections:
+        connect_hub(slug) - HUD Hub environment
+        connect_url(url) - MCP server via URL
+        connect_openapi(spec) - Mount OpenAPI spec as MCP server
+    Local connections (in-process):
+        connect_image(image) - Docker image via stdio
+        connect_fastapi(app) - Mount FastAPI app as MCP server
+        connect_server(server) - Mount MCPServer/FastMCP directly
+    MCP config:
+        connect_mcp(config) - Single mcp_config server (auto-detects local/remote)
+        connect_mcp_config(mcp_config) - Multiple mcp_config servers
+    Framework imports:
+        connect_function_tools(tools) - Import OpenAI Agents SDK FunctionTools
+    """

hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl