hud-python 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only.

Potentially problematic release.


This version of hud-python might be problematic.

Files changed (130)
  1. hud/__init__.py +22 -22
  2. hud/agents/__init__.py +13 -15
  3. hud/agents/base.py +599 -599
  4. hud/agents/claude.py +373 -373
  5. hud/agents/langchain.py +261 -250
  6. hud/agents/misc/__init__.py +7 -7
  7. hud/agents/misc/response_agent.py +82 -80
  8. hud/agents/openai.py +352 -352
  9. hud/agents/openai_chat_generic.py +154 -154
  10. hud/agents/tests/__init__.py +1 -1
  11. hud/agents/tests/test_base.py +742 -742
  12. hud/agents/tests/test_claude.py +324 -324
  13. hud/agents/tests/test_client.py +363 -363
  14. hud/agents/tests/test_openai.py +237 -237
  15. hud/cli/__init__.py +617 -617
  16. hud/cli/__main__.py +8 -8
  17. hud/cli/analyze.py +371 -371
  18. hud/cli/analyze_metadata.py +230 -230
  19. hud/cli/build.py +498 -427
  20. hud/cli/clone.py +185 -185
  21. hud/cli/cursor.py +92 -92
  22. hud/cli/debug.py +392 -392
  23. hud/cli/docker_utils.py +83 -83
  24. hud/cli/init.py +280 -281
  25. hud/cli/interactive.py +353 -353
  26. hud/cli/mcp_server.py +764 -756
  27. hud/cli/pull.py +330 -336
  28. hud/cli/push.py +404 -370
  29. hud/cli/remote_runner.py +311 -311
  30. hud/cli/runner.py +160 -160
  31. hud/cli/tests/__init__.py +3 -3
  32. hud/cli/tests/test_analyze.py +284 -284
  33. hud/cli/tests/test_cli_init.py +265 -265
  34. hud/cli/tests/test_cli_main.py +27 -27
  35. hud/cli/tests/test_clone.py +142 -142
  36. hud/cli/tests/test_cursor.py +253 -253
  37. hud/cli/tests/test_debug.py +453 -453
  38. hud/cli/tests/test_mcp_server.py +139 -139
  39. hud/cli/tests/test_utils.py +388 -388
  40. hud/cli/utils.py +263 -263
  41. hud/clients/README.md +143 -143
  42. hud/clients/__init__.py +16 -16
  43. hud/clients/base.py +378 -379
  44. hud/clients/fastmcp.py +222 -222
  45. hud/clients/mcp_use.py +298 -278
  46. hud/clients/tests/__init__.py +1 -1
  47. hud/clients/tests/test_client_integration.py +111 -111
  48. hud/clients/tests/test_fastmcp.py +342 -342
  49. hud/clients/tests/test_protocol.py +188 -188
  50. hud/clients/utils/__init__.py +1 -1
  51. hud/clients/utils/retry_transport.py +160 -160
  52. hud/datasets.py +327 -322
  53. hud/misc/__init__.py +1 -1
  54. hud/misc/claude_plays_pokemon.py +292 -292
  55. hud/otel/__init__.py +35 -35
  56. hud/otel/collector.py +142 -142
  57. hud/otel/config.py +164 -164
  58. hud/otel/context.py +536 -536
  59. hud/otel/exporters.py +366 -366
  60. hud/otel/instrumentation.py +97 -97
  61. hud/otel/processors.py +118 -118
  62. hud/otel/tests/__init__.py +1 -1
  63. hud/otel/tests/test_processors.py +197 -197
  64. hud/server/__init__.py +5 -5
  65. hud/server/context.py +114 -114
  66. hud/server/helper/__init__.py +5 -5
  67. hud/server/low_level.py +132 -132
  68. hud/server/server.py +170 -166
  69. hud/server/tests/__init__.py +3 -3
  70. hud/settings.py +73 -73
  71. hud/shared/__init__.py +5 -5
  72. hud/shared/exceptions.py +180 -180
  73. hud/shared/requests.py +264 -264
  74. hud/shared/tests/test_exceptions.py +157 -157
  75. hud/shared/tests/test_requests.py +275 -275
  76. hud/telemetry/__init__.py +25 -25
  77. hud/telemetry/instrument.py +379 -379
  78. hud/telemetry/job.py +309 -309
  79. hud/telemetry/replay.py +74 -74
  80. hud/telemetry/trace.py +83 -83
  81. hud/tools/__init__.py +33 -33
  82. hud/tools/base.py +365 -365
  83. hud/tools/bash.py +161 -161
  84. hud/tools/computer/__init__.py +15 -15
  85. hud/tools/computer/anthropic.py +437 -437
  86. hud/tools/computer/hud.py +376 -376
  87. hud/tools/computer/openai.py +295 -295
  88. hud/tools/computer/settings.py +82 -82
  89. hud/tools/edit.py +314 -314
  90. hud/tools/executors/__init__.py +30 -30
  91. hud/tools/executors/base.py +539 -539
  92. hud/tools/executors/pyautogui.py +621 -621
  93. hud/tools/executors/tests/__init__.py +1 -1
  94. hud/tools/executors/tests/test_base_executor.py +338 -338
  95. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  96. hud/tools/executors/xdo.py +511 -511
  97. hud/tools/playwright.py +412 -412
  98. hud/tools/tests/__init__.py +3 -3
  99. hud/tools/tests/test_base.py +282 -282
  100. hud/tools/tests/test_bash.py +158 -158
  101. hud/tools/tests/test_bash_extended.py +197 -197
  102. hud/tools/tests/test_computer.py +425 -425
  103. hud/tools/tests/test_computer_actions.py +34 -34
  104. hud/tools/tests/test_edit.py +259 -259
  105. hud/tools/tests/test_init.py +27 -27
  106. hud/tools/tests/test_playwright_tool.py +183 -183
  107. hud/tools/tests/test_tools.py +145 -145
  108. hud/tools/tests/test_utils.py +156 -156
  109. hud/tools/types.py +72 -72
  110. hud/tools/utils.py +50 -50
  111. hud/types.py +136 -136
  112. hud/utils/__init__.py +10 -10
  113. hud/utils/async_utils.py +65 -65
  114. hud/utils/design.py +236 -168
  115. hud/utils/mcp.py +55 -55
  116. hud/utils/progress.py +149 -149
  117. hud/utils/telemetry.py +66 -66
  118. hud/utils/tests/test_async_utils.py +173 -173
  119. hud/utils/tests/test_init.py +17 -17
  120. hud/utils/tests/test_progress.py +261 -261
  121. hud/utils/tests/test_telemetry.py +82 -82
  122. hud/utils/tests/test_version.py +8 -8
  123. hud/version.py +7 -7
  124. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/METADATA +10 -8
  125. hud_python-0.4.3.dist-info/RECORD +131 -0
  126. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/licenses/LICENSE +21 -21
  127. hud/agents/art.py +0 -101
  128. hud_python-0.4.1.dist-info/RECORD +0 -132
  129. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/WHEEL +0 -0
  130. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/entry_points.txt +0 -0
hud/datasets.py CHANGED
@@ -1,322 +1,327 @@
- """Dataset utilities for working with HuggingFace datasets and Tasks."""
-
- from __future__ import annotations
-
- import asyncio
- import json
- import logging
- from string import Template
- from typing import TYPE_CHECKING, Any, cast
-
- from datasets import Dataset, load_dataset
- from pydantic import BaseModel, Field, field_validator
-
- from hud.agents.misc import ResponseAgent
-
- from .types import MCPToolCall
-
- if TYPE_CHECKING:
-     from hud.agents import MCPAgent
-
- logger = logging.getLogger("hud.datasets")
-
-
- class Task(BaseModel):
-     """
-     A task configuration that can be used to create a task.
-
-     The mcp_config field supports environment variable substitution using
-     template placeholders in the format ${VAR_NAME} or ${VAR_NAME:default_value}.
-
-     Example:
-         mcp_config: {
-             "hud": {
-                 "url": "${HUD_MCP_URL:https://mcp.hud.so/v3/mcp}",
-                 "headers": {
-                     "Authorization": "Bearer ${HUD_API_KEY}",
-                     "Mcp-Image": "your-mcp-image"
-                 }
-             }
-         }
-     """
-
-     id: str | None = None
-     prompt: str
-     mcp_config: dict[str, Any]
-     setup_tool: MCPToolCall | list[MCPToolCall] | None = None
-     evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
-     system_prompt: str | None = None
-     metadata: dict[str, Any] = Field(default_factory=dict)
-
-     @field_validator("mcp_config", "metadata", mode="before")
-     @classmethod
-     def parse_json_strings(cls, v: Any) -> Any:
-         """Parse JSON strings into dictionaries."""
-         if isinstance(v, str):
-             try:
-                 return json.loads(v)
-             except json.JSONDecodeError as e:
-                 raise ValueError(f"Invalid JSON string: {e}") from e
-         return v
-
-     @field_validator("setup_tool", "evaluate_tool", mode="before")
-     @classmethod
-     def convert_dict_to_tool_call(cls, v: Any) -> Any:
-         """Convert dict to MCPToolCall instance, parsing JSON strings first."""
-         if v is None:
-             return None
-
-         # Parse JSON string if needed
-         if isinstance(v, str):
-             try:
-                 v = json.loads(v)
-             except json.JSONDecodeError as e:
-                 raise ValueError(f"Invalid JSON string: {e}") from e
-
-         if isinstance(v, dict):
-             return MCPToolCall(**v)
-         if isinstance(v, list):
-             return [MCPToolCall(**item) if isinstance(item, dict) else item for item in v]
-         return v
-
-     @field_validator("mcp_config", mode="before")
-     @classmethod
-     def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
-         """
-         Automatically resolve environment variables in mcp_config using Template.
-
-         Supports ${VAR_NAME} syntax with variable substitution from
-         System environment variables (including HUD_API_KEY, etc.)
-
-         Missing variables resolve to empty strings.
-         """
-         import os
-
-         # Start with current environment variables
-         mapping = dict(os.environ)
-
-         def substitute_in_value(obj: Any) -> Any:
-             """Recursively substitute variables in nested structures."""
-             if isinstance(obj, str):
-                 # Use Template's substitute with defaultdict - missing vars become empty strings
-                 from collections import defaultdict
-
-                 safe_mapping = defaultdict(str, mapping)
-                 return Template(obj).substitute(safe_mapping)
-             elif isinstance(obj, dict):
-                 return {k: substitute_in_value(v) for k, v in obj.items()}
-             elif isinstance(obj, list):
-                 return [substitute_in_value(item) for item in obj]
-             else:
-                 return obj
-
-         return substitute_in_value(v)
-
-
- async def fetch_system_prompt_from_dataset(dataset_id: str) -> str | None:
-     """
-     Fetch system_prompt.txt from a HuggingFace dataset repository.
-
-     Args:
-         dataset_id: HuggingFace dataset identifier (e.g., "hud-evals/SheetBench-50")
-
-     Returns:
-         System prompt text if found, None otherwise
-     """
-     try:
-         # Import here to avoid unnecessary dependency
-         from huggingface_hub import hf_hub_download
-         from huggingface_hub.errors import EntryNotFoundError
-
-         # Try to download the system_prompt.txt file
-         try:
-             file_path = hf_hub_download(
-                 repo_id=dataset_id, filename="system_prompt.txt", repo_type="dataset"
-             )
-
-             # Read and return the content
-             with open(file_path, encoding="utf-8") as f:  # noqa: ASYNC230
-                 content = f.read().strip()
-                 if content:
-                     logger.info(
-                         "Loaded system prompt from %s (length: %d chars)", dataset_id, len(content)
-                     )
-                     return content
-                 else:
-                     logger.warning("System prompt file is empty in %s", dataset_id)
-                     return None
-
-         except EntryNotFoundError:
-             logger.debug("No system_prompt.txt found in dataset %s", dataset_id)
-             return None
-
-     except ImportError:
-         logger.warning(
-             "huggingface_hub not installed. Install it to fetch system prompts from datasets."
-         )
-         return None
-     except Exception as e:
-         logger.error("Error fetching system prompt from %s: %s", dataset_id, e)
-         return None
-
-
- async def run_dataset(
-     name: str,
-     dataset: str | Dataset | list[dict[str, Any]],
-     agent_class: type[MCPAgent],
-     agent_config: dict[str, Any] | None = None,
-     max_concurrent: int = 50,
-     metadata: dict[str, Any] | None = None,
-     max_steps: int = 40,
-     split: str = "train",
-     auto_respond: bool = False,
-     custom_system_prompt: str | None = None,
- ) -> list[Any]:
-     """
-     Run all tasks in a dataset with automatic job tracking.
-
-     Args:
-         name: Name for the job
-         dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
-             Dataset object, OR list of Task objects
-         agent_class: Agent class to instantiate (e.g., ClaudeAgent)
-         agent_config: Configuration/kwargs for agent (model, etc.)
-         max_concurrent: Maximum parallel task execution
-         metadata: Optional metadata for the job
-         max_steps: Maximum steps per task
-         split: Dataset split to use when loading from string (default: "train")
-         auto_respond: Whether to use auto-response agent
-
-     Returns:
-         List of results from agent.run() in dataset order
-
-     Example:
-         >>> from hud.agents import ClaudeAgent
-         >>> # Option 1: From dataset string identifier
-         >>> results = await run_dataset(
-         ...     "SheetBench Eval",
-         ...     "hud-evals/SheetBench-50",
-         ...     ClaudeAgent,
-         ...     {"model": "claude-3-5-sonnet-20241022"},
-         ... )
-         >>> # Option 2: From HuggingFace dataset object
-         >>> from datasets import load_dataset
-         >>> dataset = load_dataset("hud-evals/SheetBench-50", split="train")
-         >>> results = await run_dataset("my_eval", dataset, ClaudeAgent)
-         >>> # Option 3: From list of dicts
-         >>> tasks = [{"prompt": "...", "mcp_config": {...}, ...}, ...]
-         >>> results = await run_dataset("browser_eval", tasks, ClaudeAgent)
-     """
-     # Import here to avoid circular imports
-     import hud
-
-     dataset_link = None
-
-     # Load dataset from string if needed
-     if isinstance(dataset, str):
-         logger.info("Loading dataset %s from HuggingFace...", dataset)
-         dataset_link = dataset
-
-         # Load dataset from HuggingFace
-         dataset = cast("Dataset", load_dataset(dataset, split=split))
-
-     # Create job context
-     job_metadata = metadata or {}
-     job_metadata["agent_class"] = agent_class.__name__
-     job_metadata["agent_config"] = agent_config
-
-     # Extract dataset verification info if available
-     if isinstance(dataset, Dataset) and not dataset_link:
-         general_info = next(iter(dataset.info.__dict__["download_checksums"].keys())).split("/")
-         project = general_info[3]
-         dataset_name = general_info[4].split("@")[0]
-         dataset_link = f"{project}/{dataset_name}"
-
-     with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
-         # Run tasks with semaphore for concurrency control
-         sem = asyncio.Semaphore(max_concurrent)
-         results: list[Any | None] = [None] * len(dataset)
-
-         async def _worker(index: int, task_dict: Any, max_steps: int = 40) -> None:
-             async with sem:
-                 # Create trace for this task
-                 task_name = task_dict.get("prompt") or f"Task {index}"
-                 if "system_prompt" not in task_dict:
-                     task_dict["system_prompt"] = custom_system_prompt
-                 with hud.trace(task_name, job_id=job_obj.id, task_id=task_dict.get("id")):
-                     # Convert dict to Task here, at trace level
-                     task = Task(**task_dict)
-
-                     agent = agent_class(**(agent_config or {}))
-
-                     if auto_respond:
-                         agent.response_agent = ResponseAgent()
-                     results[index] = await agent.run(task, max_steps=max_steps)
-
-         # Execute all tasks
-         await asyncio.gather(
-             *[_worker(i, task, max_steps=max_steps) for i, task in enumerate(dataset)],
-             return_exceptions=True,  # Don't fail entire batch on one error
-         )
-
-     return results
-
-
- def save_tasks(
-     tasks: list[dict[str, Any]], repo_id: str, fields: list[str] | None = None, **kwargs: Any
- ) -> None:
-     """
-     Save data to HuggingFace dataset with JSON string serialization.
-
-     Complex fields (dicts, lists) are serialized as JSON strings to maintain clean schema
-     and avoid null value pollution in HuggingFace datasets.
-
-     Args:
-         tasks: List of dictionaries to save
-         repo_id: HuggingFace repository ID (e.g., "hud-evals/my-tasks")
-         fields: Optional list of fields to save. If None, saves all fields from each dict.
-         **kwargs: Additional arguments passed to dataset.push_to_hub()
-     """
-     from datasets import Dataset
-
-     # Safety check: Ensure we're not saving Task objects (which have resolved env vars)
-     if tasks and isinstance(tasks[0], Task):
-         raise ValueError(
-             "save_tasks expects dictionaries, not Task objects. "
-             "Task objects have resolved environment variables which would expose secrets. "
-             "Please pass raw dictionaries with template strings like '${HUD_API_KEY}' preserved."
-         )
-
-     # Convert to rows with JSON string fields
-     data = []
-     for i, tc_dict in enumerate(tasks):
-         # Additional safety check for each item
-         if isinstance(tc_dict, Task):
-             raise ValueError(
-                 f"Item {i} is a Task object, not a dictionary. "
-                 "This would expose resolved environment variables. "
-                 "Please convert to dictionary format with template strings preserved."
-             )
-
-         row = {}
-
-         # Determine which fields to process
-         fields_to_process = fields if fields is not None else list(tc_dict.keys())
-
-         for field in fields_to_process:
-             if field in tc_dict:
-                 value = tc_dict[field]
-                 # Serialize complex types as JSON strings
-                 if isinstance(value, (dict | list)):
-                     row[field] = json.dumps(value)
-                 elif isinstance(value, (str | int | float | bool | type(None))):
-                     row[field] = value if value is not None else ""
-                 else:
-                     # For other types, convert to string
-                     row[field] = str(value)
-
-         data.append(row)
-
-     # Create and push dataset
-     dataset = Dataset.from_list(data)
-     dataset.push_to_hub(repo_id, **kwargs)
+ """Dataset utilities for working with HuggingFace datasets and Tasks."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import logging
+ from string import Template
+ from typing import TYPE_CHECKING, Any, cast
+
+ from datasets import Dataset, load_dataset
+ from pydantic import BaseModel, Field, field_validator
+
+ from hud.agents.misc import ResponseAgent
+ from hud.settings import settings
+
+ from .types import MCPToolCall
+
+ if TYPE_CHECKING:
+     from hud.agents import MCPAgent
+
+ logger = logging.getLogger("hud.datasets")
+
+
+ class Task(BaseModel):
+     """
+     A task configuration that can be used to create a task.
+
+     The mcp_config field supports environment variable substitution using
+     template placeholders in the format ${VAR_NAME} or ${VAR_NAME:default_value}.
+
+     Example:
+         mcp_config: {
+             "hud": {
+                 "url": "${HUD_MCP_URL:https://mcp.hud.so/v3/mcp}",
+                 "headers": {
+                     "Authorization": "Bearer ${HUD_API_KEY}",
+                     "Mcp-Image": "your-mcp-image"
+                 }
+             }
+         }
+     """
+
+     id: str | None = None
+     prompt: str
+     mcp_config: dict[str, Any]
+     setup_tool: MCPToolCall | list[MCPToolCall] | None = None
+     evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
+     system_prompt: str | None = None
+     metadata: dict[str, Any] = Field(default_factory=dict)
+
+     @field_validator("mcp_config", "metadata", mode="before")
+     @classmethod
+     def parse_json_strings(cls, v: Any) -> Any:
+         """Parse JSON strings into dictionaries."""
+         if isinstance(v, str):
+             try:
+                 return json.loads(v)
+             except json.JSONDecodeError as e:
+                 raise ValueError(f"Invalid JSON string: {e}") from e
+         return v
+
+     @field_validator("setup_tool", "evaluate_tool", mode="before")
+     @classmethod
+     def convert_dict_to_tool_call(cls, v: Any) -> Any:
+         """Convert dict to MCPToolCall instance, parsing JSON strings first."""
+         if v is None:
+             return None
+
+         # Parse JSON string if needed
+         if isinstance(v, str):
+             try:
+                 v = json.loads(v)
+             except json.JSONDecodeError as e:
+                 raise ValueError(f"Invalid JSON string: {e}") from e
+
+         if isinstance(v, dict):
+             return MCPToolCall(**v)
+         if isinstance(v, list):
+             return [MCPToolCall(**item) if isinstance(item, dict) else item for item in v]
+         return v
+
+     @field_validator("mcp_config", mode="before")
+     @classmethod
+     def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
+         """
+         Automatically resolve environment variables in mcp_config using Template.
+
+         Supports ${VAR_NAME} syntax with variable substitution from
+         System environment variables (including HUD_API_KEY, etc.)
+
+         Missing variables resolve to empty strings.
+         """
+         import os
+
+         # Start with current environment variables
+         mapping = dict(os.environ)
+         mapping.update(settings.model_dump())
+
+         if settings.api_key:
+             mapping["HUD_API_KEY"] = settings.api_key
+
+         def substitute_in_value(obj: Any) -> Any:
+             """Recursively substitute variables in nested structures."""
+             if isinstance(obj, str):
+                 # Use Template's substitute with defaultdict - missing vars become empty strings
+                 from collections import defaultdict
+
+                 safe_mapping = defaultdict(str, mapping)
+                 return Template(obj).substitute(safe_mapping)
+             elif isinstance(obj, dict):
+                 return {k: substitute_in_value(v) for k, v in obj.items()}
+             elif isinstance(obj, list):
+                 return [substitute_in_value(item) for item in obj]
+             else:
+                 return obj
+
+         return substitute_in_value(v)
+
+
+ async def fetch_system_prompt_from_dataset(dataset_id: str) -> str | None:
+     """
+     Fetch system_prompt.txt from a HuggingFace dataset repository.
+
+     Args:
+         dataset_id: HuggingFace dataset identifier (e.g., "hud-evals/SheetBench-50")
+
+     Returns:
+         System prompt text if found, None otherwise
+     """
+     try:
+         # Import here to avoid unnecessary dependency
+         from huggingface_hub import hf_hub_download
+         from huggingface_hub.errors import EntryNotFoundError
+
+         # Try to download the system_prompt.txt file
+         try:
+             file_path = hf_hub_download(
+                 repo_id=dataset_id, filename="system_prompt.txt", repo_type="dataset"
+             )
+
+             # Read and return the content
+             with open(file_path, encoding="utf-8") as f:  # noqa: ASYNC230
+                 content = f.read().strip()
+                 if content:
+                     logger.info(
+                         "Loaded system prompt from %s (length: %d chars)", dataset_id, len(content)
+                     )
+                     return content
+                 else:
+                     logger.warning("System prompt file is empty in %s", dataset_id)
+                     return None
+
+         except EntryNotFoundError:
+             logger.debug("No system_prompt.txt found in dataset %s", dataset_id)
+             return None
+
+     except ImportError:
+         logger.warning(
+             "huggingface_hub not installed. Install it to fetch system prompts from datasets."
+         )
+         return None
+     except Exception as e:
+         logger.error("Error fetching system prompt from %s: %s", dataset_id, e)
+         return None
+
+
+ async def run_dataset(
+     name: str,
+     dataset: str | Dataset | list[dict[str, Any]],
+     agent_class: type[MCPAgent],
+     agent_config: dict[str, Any] | None = None,
+     max_concurrent: int = 50,
+     metadata: dict[str, Any] | None = None,
+     max_steps: int = 40,
+     split: str = "train",
+     auto_respond: bool = False,
+     custom_system_prompt: str | None = None,
+ ) -> list[Any]:
+     """
+     Run all tasks in a dataset with automatic job tracking.
+
+     Args:
+         name: Name for the job
+         dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
+             Dataset object, OR list of Task objects
+         agent_class: Agent class to instantiate (e.g., ClaudeAgent)
+         agent_config: Configuration/kwargs for agent (model, etc.)
+         max_concurrent: Maximum parallel task execution
+         metadata: Optional metadata for the job
+         max_steps: Maximum steps per task
+         split: Dataset split to use when loading from string (default: "train")
+         auto_respond: Whether to use auto-response agent
+
+     Returns:
+         List of results from agent.run() in dataset order
+
+     Example:
+         >>> from hud.agents import ClaudeAgent
+         >>> # Option 1: From dataset string identifier
+         >>> results = await run_dataset(
+         ...     "SheetBench Eval",
+         ...     "hud-evals/SheetBench-50",
+         ...     ClaudeAgent,
+         ...     {"model": "claude-3-5-sonnet-20241022"},
+         ... )
+         >>> # Option 2: From HuggingFace dataset object
+         >>> from datasets import load_dataset
+         >>> dataset = load_dataset("hud-evals/SheetBench-50", split="train")
+         >>> results = await run_dataset("my_eval", dataset, ClaudeAgent)
+         >>> # Option 3: From list of dicts
+         >>> tasks = [{"prompt": "...", "mcp_config": {...}, ...}, ...]
+         >>> results = await run_dataset("browser_eval", tasks, ClaudeAgent)
+     """
+     # Import here to avoid circular imports
+     import hud
+
+     dataset_link = None
+
+     # Load dataset from string if needed
+     if isinstance(dataset, str):
+         logger.info("Loading dataset %s from HuggingFace...", dataset)
+         dataset_link = dataset
+
+         # Load dataset from HuggingFace
+         dataset = cast("Dataset", load_dataset(dataset, split=split))
+
+     # Create job context
+     job_metadata = metadata or {}
+     job_metadata["agent_class"] = agent_class.__name__
+     job_metadata["agent_config"] = agent_config
+
+     # Extract dataset verification info if available
+     if isinstance(dataset, Dataset) and not dataset_link:
+         general_info = next(iter(dataset.info.__dict__["download_checksums"].keys())).split("/")
+         project = general_info[3]
+         dataset_name = general_info[4].split("@")[0]
+         dataset_link = f"{project}/{dataset_name}"
+
+     with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
+         # Run tasks with semaphore for concurrency control
+         sem = asyncio.Semaphore(max_concurrent)
+         results: list[Any | None] = [None] * len(dataset)
+
+         async def _worker(index: int, task_dict: Any, max_steps: int = 40) -> None:
+             async with sem:
+                 # Create trace for this task
+                 task_name = task_dict.get("prompt") or f"Task {index}"
+                 if "system_prompt" not in task_dict:
+                     task_dict["system_prompt"] = custom_system_prompt
+                 with hud.trace(task_name, job_id=job_obj.id, task_id=task_dict.get("id")):
+                     # Convert dict to Task here, at trace level
+                     task = Task(**task_dict)
+
+                     agent = agent_class(**(agent_config or {}))
+
+                     if auto_respond:
+                         agent.response_agent = ResponseAgent()
+                     results[index] = await agent.run(task, max_steps=max_steps)
+
+         # Execute all tasks
+         await asyncio.gather(
+             *[_worker(i, task, max_steps=max_steps) for i, task in enumerate(dataset)],
+             return_exceptions=True,  # Don't fail entire batch on one error
+         )
+
+     return results
+
+
+ def save_tasks(
+     tasks: list[dict[str, Any]], repo_id: str, fields: list[str] | None = None, **kwargs: Any
+ ) -> None:
+     """
+     Save data to HuggingFace dataset with JSON string serialization.
+
+     Complex fields (dicts, lists) are serialized as JSON strings to maintain clean schema
+     and avoid null value pollution in HuggingFace datasets.
+
+     Args:
+         tasks: List of dictionaries to save
+         repo_id: HuggingFace repository ID (e.g., "hud-evals/my-tasks")
+         fields: Optional list of fields to save. If None, saves all fields from each dict.
+         **kwargs: Additional arguments passed to dataset.push_to_hub()
+     """
+     from datasets import Dataset
+
+     # Safety check: Ensure we're not saving Task objects (which have resolved env vars)
+     if tasks and isinstance(tasks[0], Task):
+         raise ValueError(
+             "save_tasks expects dictionaries, not Task objects. "
+             "Task objects have resolved environment variables which would expose secrets. "
+             "Please pass raw dictionaries with template strings like '${HUD_API_KEY}' preserved."
+         )
+
+     # Convert to rows with JSON string fields
+     data = []
+     for i, tc_dict in enumerate(tasks):
+         # Additional safety check for each item
+         if isinstance(tc_dict, Task):
+             raise ValueError(
+                 f"Item {i} is a Task object, not a dictionary. "
+                 "This would expose resolved environment variables. "
+                 "Please convert to dictionary format with template strings preserved."
+             )
+
+         row = {}
+
+         # Determine which fields to process
+         fields_to_process = fields if fields is not None else list(tc_dict.keys())
+
+         for field in fields_to_process:
+             if field in tc_dict:
+                 value = tc_dict[field]
+                 # Serialize complex types as JSON strings
+                 if isinstance(value, (dict | list)):
+                     row[field] = json.dumps(value)
+                 elif isinstance(value, (str | int | float | bool | type(None))):
+                     row[field] = value if value is not None else ""
+                 else:
+                     # For other types, convert to string
+                     row[field] = str(value)
+
+         data.append(row)
+
+     # Create and push dataset
+     dataset = Dataset.from_list(data)
+     dataset.push_to_hub(repo_id, **kwargs)
+ dataset.push_to_hub(repo_id, **kwargs)