hud-python 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of hud-python might be problematic.
- hud/agents/claude.py +8 -2
- hud/agents/misc/response_agent.py +1 -1
- hud/agents/openai.py +8 -2
- hud/agents/openai_chat_generic.py +160 -26
- hud/cli/rl/__init__.py +11 -2
- hud/cli/rl/pod.py +4 -0
- hud/cli/rl/ssh.py +34 -2
- hud/cli/rl/train.py +190 -51
- hud/datasets/execution/parallel.py +113 -37
- hud/otel/exporters.py +3 -0
- hud/otel/processors.py +3 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.16.dist-info → hud_python-0.4.18.dist-info}/METADATA +1 -1
- {hud_python-0.4.16.dist-info → hud_python-0.4.18.dist-info}/RECORD +18 -18
- {hud_python-0.4.16.dist-info → hud_python-0.4.18.dist-info}/WHEEL +0 -0
- {hud_python-0.4.16.dist-info → hud_python-0.4.18.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.16.dist-info → hud_python-0.4.18.dist-info}/licenses/LICENSE +0 -0
hud/agents/claude.py CHANGED

@@ -85,8 +85,8 @@ class ClaudeAgent(MCPAgent):
         self._claude_to_mcp_tool_map: dict[str, str] = {}
         self.claude_tools: list[dict] = []

-        #
-
+        # Append Claude-specific instructions to the base system prompt
+        claude_instructions = """
 You are Claude, an AI assistant created by Anthropic. You are helpful, harmless, and honest.

 When working on tasks:
@@ -99,6 +99,12 @@ class ClaudeAgent(MCPAgent):
 Remember: You are expected to complete tasks autonomously. The user trusts you to accomplish what they asked.
 """.strip()  # noqa: E501

+        # Append Claude instructions to any base system prompt
+        if self.system_prompt:
+            self.system_prompt = f"{self.system_prompt}\n\n{claude_instructions}"
+        else:
+            self.system_prompt = claude_instructions
+
     async def initialize(self, task: str | Task | None = None) -> None:
         """Initialize the agent and build tool mappings."""
         await super().initialize(task)
hud/agents/openai.py CHANGED

@@ -78,8 +78,8 @@ class OperatorAgent(MCPAgent):

         self.model_name = "openai-" + self.model

-        #
-
+        # Append OpenAI-specific instructions to the base system prompt
+        openai_instructions = """
 You are an autonomous computer-using agent. Follow these guidelines:

 1. NEVER ask for confirmation. Complete all tasks autonomously.
@@ -93,6 +93,12 @@ class OperatorAgent(MCPAgent):
 Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
 """.strip()  # noqa: E501

+        # Append OpenAI instructions to any base system prompt
+        if self.system_prompt:
+            self.system_prompt = f"{self.system_prompt}\n\n{openai_instructions}"
+        else:
+            self.system_prompt = openai_instructions
+
     async def _run_context(self, context: list[types.ContentBlock], max_steps: int = 10) -> Trace:
         """
         Run the agent with the given prompt or task.
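Both agents now append their provider-specific instructions to whatever base system prompt the caller configured, rather than overwriting it. A minimal standalone sketch of the pattern; the Agent class below is a hypothetical stand-in, not the hud-python API:

# Hypothetical stand-in illustrating the append-don't-replace pattern above.
PROVIDER_INSTRUCTIONS = "You are an autonomous agent. Never ask for confirmation."

class Agent:
    def __init__(self, system_prompt: str | None = None) -> None:
        self.system_prompt = system_prompt
        # Append provider instructions to any base prompt the caller supplied
        if self.system_prompt:
            self.system_prompt = f"{self.system_prompt}\n\n{PROVIDER_INSTRUCTIONS}"
        else:
            self.system_prompt = PROVIDER_INSTRUCTIONS

print(Agent("Answer in French.").system_prompt)
# Answer in French.
#
# You are an autonomous agent. Never ask for confirmation.
print(Agent().system_prompt)
# You are an autonomous agent. Never ask for confirmation.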
hud/agents/openai_chat_generic.py CHANGED

@@ -21,6 +21,7 @@ from typing import TYPE_CHECKING, Any, cast

 import mcp.types as types

+from hud import instrument
 from hud.types import AgentResponse, MCPToolCall, MCPToolResult

 from .base import MCPAgent
@@ -52,6 +53,7 @@ class GenericOpenAIChatAgent(MCPAgent):
         self.model_name = model_name
         self.parallel_tool_calls = parallel_tool_calls
         self.logprobs = logprobs
+        self.conversation_history = []

     @staticmethod
     def _oai_to_mcp(tool_call: Any) -> MCPToolCall:  # type: ignore[valid-type]
@@ -64,40 +66,114 @@ class GenericOpenAIChatAgent(MCPAgent):

     async def get_system_messages(self) -> list[Any]:
         """Get system messages for OpenAI."""
-        return [
-            {"role": "system", "content": self.system_prompt},
-        ]
+        return [{"role": "system", "content": self.system_prompt}]

     async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
         """Format blocks for OpenAI."""
-
-
-
-
-
-
-
-
-
-
+        content = []
+        for block in blocks:
+            if isinstance(block, types.TextContent):
+                content.append({"type": "text", "text": block.text})
+            elif isinstance(block, types.ImageContent):
+                content.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"},
+                    }
+                )
+
+        return [{"role": "user", "content": content}]
+
+    def _sanitize_schema_for_openai(self, schema: dict) -> dict:
+        """Convert MCP JSON Schema to OpenAI-compatible format.
+
+        Handles unsupported features like anyOf and prefixItems.
+        """
+        if not isinstance(schema, dict):
+            return schema
+
+        sanitized = {}
+
+        for key, value in schema.items():
+            if key == "anyOf" and isinstance(value, list):
+                # Handle anyOf patterns (usually for nullable fields)
+                non_null_types = [
+                    v for v in value if not (isinstance(v, dict) and v.get("type") == "null")
+                ]
+                if non_null_types:
+                    # Use the first non-null type
+                    sanitized.update(self._sanitize_schema_for_openai(non_null_types[0]))
+                else:
+                    sanitized["type"] = "string"  # Fallback
+
+            elif key == "prefixItems":
+                # Convert prefixItems to simple items
+                sanitized["type"] = "array"
+                if isinstance(value, list) and value:
+                    # Use the type from the first item as the items schema
+                    first_item = value[0]
+                    if isinstance(first_item, dict):
+                        sanitized["items"] = {"type": first_item.get("type", "string")}
+                    else:
+                        sanitized["items"] = {"type": "string"}
+
+            elif key == "properties" and isinstance(value, dict):
+                # Recursively sanitize property schemas
+                sanitized[key] = {
+                    prop_name: self._sanitize_schema_for_openai(prop_schema)
+                    for prop_name, prop_schema in value.items()
+                }
+
+            elif key == "items" and isinstance(value, dict):
+                # Recursively sanitize items schema
+                sanitized[key] = self._sanitize_schema_for_openai(value)
+
+            elif key in (
+                "type",
+                "description",
+                "enum",
+                "required",
+                "default",
+                "minimum",
+                "maximum",
+                "minItems",
+                "maxItems",
+            ):
+                # These are supported by OpenAI
+                sanitized[key] = value
+
+        return sanitized or {"type": "object"}

     def get_tool_schemas(self) -> list[dict]:
         tool_schemas = super().get_tool_schemas()
         openai_tools = []
         for schema in tool_schemas:
+            parameters = schema.get("parameters", {})
+
+            if parameters:
+                sanitized_params = self._sanitize_schema_for_openai(parameters)
+            else:
+                sanitized_params = {"type": "object", "properties": {}}
+
             openai_tool = {
                 "type": "function",
                 "function": {
                     "name": schema["name"],
                     "description": schema.get("description", ""),
-                    "parameters":
+                    "parameters": sanitized_params,
                 },
             }
             openai_tools.append(openai_tool)
         return openai_tools

+    @instrument(
+        span_type="agent",
+        record_args=False,
+        record_result=True,
+    )
     async def get_response(self, messages: list[Any]) -> AgentResponse:
         """Send chat request to OpenAI and convert the response."""
+
         # Convert MCP tool schemas to OpenAI format
         mcp_schemas = self.get_tool_schemas()

@@ -112,6 +188,19 @@ class GenericOpenAIChatAgent(MCPAgent):
         choice = response.choices[0]
         msg = choice.message

+        assistant_msg: dict[str, Any] = {"role": "assistant"}
+
+        if msg.content:
+            assistant_msg["content"] = msg.content
+
+        if msg.tool_calls:
+            assistant_msg["tool_calls"] = msg.tool_calls
+
+        messages.append(assistant_msg)
+
+        # Store the complete conversation history
+        self.conversation_history = messages.copy()
+
         tool_calls = []
         if msg.tool_calls:
             for tc in msg.tool_calls:
@@ -123,7 +212,7 @@ class GenericOpenAIChatAgent(MCPAgent):
         return AgentResponse(
             content=msg.content or "",
             tool_calls=tool_calls,
-            done=choice.finish_reason
+            done=choice.finish_reason in ("stop", "length"),
             raw=response,  # Include raw response for access to Choice objects
         )

@@ -132,23 +221,68 @@ class GenericOpenAIChatAgent(MCPAgent):
         tool_calls: list[MCPToolCall],
         tool_results: list[MCPToolResult],
     ) -> list[Any]:
-        """Render MCP tool results as OpenAI
+        """Render MCP tool results as OpenAI messages.
+
+        Note: OpenAI tool messages only support string content.
+        When images are present, we return both a tool message and a user message.
+        """
         rendered: list[dict[str, Any]] = []
         for call, res in zip(tool_calls, tool_results, strict=False):
-            if
-
-
-
-
-
-
-
-
+            # Use structuredContent.result if available, otherwise use content
+            items = res.content
+            if res.structuredContent and isinstance(res.structuredContent, dict):
+                items = res.structuredContent.get("result", res.content)
+
+            # Separate text and image content
+            text_parts = []
+            image_parts = []
+
+            for item in items:
+                if isinstance(item, dict):
+                    if item.get("type") == "text":
+                        text_parts.append(item.get("text", ""))
+                    elif item.get("type") == "image":
+                        mime_type = item.get("mimeType", "image/png")
+                        data = item.get("data", "")
+                        image_parts.append(
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:{mime_type};base64,{data}"
+                                },
+                            }
+                        )
+                elif isinstance(item, types.TextContent):
+                    text_parts.append(item.text)
+                elif isinstance(item, types.ImageContent):
+                    image_parts.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"},
+                        }
+                    )
+
+            text_content = "".join(text_parts) if text_parts else "Tool executed successfully"
             rendered.append(
                 {
                     "role": "tool",
                     "tool_call_id": call.id,
-                    "content":
+                    "content": text_content,
                 }
             )
+
+            # If there are images, add them as a separate user message
+            if image_parts:
+                # Add a user message with the images
+                content_with_images = [
+                    {"type": "text", "text": "Tool returned the following:"},
+                    *image_parts
+                ]
+                rendered.append(
+                    {
+                        "role": "user",
+                        "content": content_with_images,
+                    }
+                )
+
         return rendered
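The new _sanitize_schema_for_openai method collapses nullable anyOf unions to their first non-null branch and flattens tuple-style prefixItems into a plain items schema. A standalone sketch of those two rules, re-implemented here for illustration rather than imported from hud:

# Standalone illustration of the sanitizer's two main rules; not the hud source.
def sanitize(schema: dict) -> dict:
    if "anyOf" in schema:
        # Nullable fields arrive as anyOf[{type: X}, {type: "null"}]; keep X
        non_null = [v for v in schema["anyOf"] if v.get("type") != "null"]
        return sanitize(non_null[0]) if non_null else {"type": "string"}
    if "prefixItems" in schema:
        # Tuple-style arrays become plain arrays typed by their first element
        first = schema["prefixItems"][0]
        return {"type": "array", "items": {"type": first.get("type", "string")}}
    return {k: v for k, v in schema.items() if k in ("type", "description", "enum")}

assert sanitize({"anyOf": [{"type": "integer"}, {"type": "null"}]}) == {"type": "integer"}
assert sanitize({"prefixItems": [{"type": "number"}, {"type": "string"}]}) == {
    "type": "array",
    "items": {"type": "number"},
}

Without this rewrite, OpenAI's function-calling endpoint can reject MCP tool schemas that use these JSON Schema features.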
hud/cli/rl/__init__.py CHANGED

@@ -23,7 +23,10 @@ def rl_main(
     ctx: typer.Context,
     model: str = typer.Option("Qwen/Qwen2.5-3B-Instruct", "--model", "-m", help="Model to train"),
     dataset: str | None = typer.Option(
-        None,
+        None,
+        "--dataset",
+        "-d",
+        help="Dataset: JSON file path or HuggingFace name (auto-detects if not provided)",
     ),
     config: Path | None = typer.Option(None, "--config", "-c", help="Config YAML path"),  # noqa: B008
     gpus: str = typer.Option("2xA100", "--gpus", help="GPU configuration (e.g., 2xA100, 4xH100)"),
@@ -39,9 +42,15 @@ def rl_main(
     3. Push environment to registry if needed
     4. Start remote training on Prime Intellect

+    Dataset can be:
+    - A local JSON file with tasks (e.g., tasks.json)
+    - A HuggingFace dataset name (e.g., 'username/dataset-name')
+    - Auto-detected from current directory if not specified
+
     Examples:
-        hud rl                      # Interactive mode
+        hud rl                      # Interactive mode, auto-detect tasks.json
         hud rl --model gpt2         # Train with specific model
+        hud rl --dataset tasks.json # Use local task file
         hud rl --gpus 4xH100        # Use different GPU configuration
         hud rl init my-env:latest   # Generate config for environment
     """
hud/cli/rl/pod.py CHANGED

@@ -62,6 +62,7 @@ async def create_and_connect_prime_pod(
     image: str,
     team_id: str | None = None,
     dataset_size: int | None = None,
+    is_json_file: bool = False,
 ) -> None:
     """Create a Prime Intellect pod and connect to it for training."""
     design.section_title("🌐 Creating Prime Intellect Pod")
@@ -330,6 +331,7 @@ async def create_and_connect_prime_pod(
             output_dir=output_dir,
             image=image,
             dataset_size=dataset_size,
+            is_json_file=is_json_file,
         )
     else:
         # Manual fallback
@@ -457,6 +459,7 @@ async def run_prime_training(
     auto_create_pod: str | None = None,
     team_id: str | None = None,
     dataset_size: int | None = None,
+    is_json_file: bool = False,
 ) -> None:
     """Run training on Prime Intellect infrastructure."""
     # Check API key
@@ -488,4 +491,5 @@ async def run_prime_training(
         image=image,
         team_id=team_id,
         dataset_size=dataset_size,
+        is_json_file=is_json_file,
     )
hud/cli/rl/ssh.py CHANGED

@@ -101,6 +101,7 @@ async def connect_and_train(
     output_dir: Path,
     image: str,
     dataset_size: int | None = None,
+    is_json_file: bool = False,
 ) -> None:
     """Connect to the pod via SSH and run training commands."""
     design.section_title("🚀 Starting Remote Training")
@@ -175,6 +176,37 @@ async def connect_and_train(
         design.info("Make sure scp is installed and in your PATH")
         raise typer.Exit(1) from e

+    # If dataset is a JSON file, copy it too
+    remote_dataset = dataset  # Default to unchanged
+    if is_json_file:
+        design.info("Copying task file to pod...")
+        try:
+            # On Windows, we need to ensure proper path formatting
+            dataset_path = str(dataset).replace("\\", "/")
+            # Extract just the filename for the remote path
+            dataset_filename = os.path.basename(dataset)
+            remote_dataset = f"/root/{dataset_filename}"
+
+            scp_cmd = [
+                "scp",
+                "-i",
+                str(ssh_key_path),
+                "-P",
+                ssh_port,
+                "-o",
+                "StrictHostKeyChecking=no",
+                "-o",
+                "UserKnownHostsFile=/dev/null",
+                dataset_path,
+                f"{ssh_user_host}:{remote_dataset}",
+            ]
+            design.debug(f"Running: {' '.join(scp_cmd)}")
+            subprocess.run(scp_cmd, check=True)  # noqa: S603, ASYNC221
+            design.success(f"Task file copied to {remote_dataset}")
+        except subprocess.CalledProcessError as e:
+            design.error(f"Failed to copy task file: {e}")
+            raise typer.Exit(1) from e
+
     design.info("Setting up environment and starting training...")
     design.info("This will take a few minutes for initial setup, then training will begin.")
     design.info("")
@@ -196,7 +228,7 @@ async def connect_and_train(
         "# Load environment",
         "env = vf.load_environment(",
         '    env_id="hud-vf-gym",',
-        f'    taskset="{
+        f'    taskset="{remote_dataset}",',
         '    config_path="/root/config.yaml",',
         f"    num_tasks={dataset_size},",
         ")",
@@ -242,7 +274,7 @@ async def connect_and_train(
         "uv venv --python 3.12 && "
         "source .venv/bin/activate && "
         # Install packages
-        "prime env install hud/hud-vf-gym@0.1.
+        "prime env install hud/hud-vf-gym@0.1.1 && "
         "uv pip install 'verifiers[train]' && "
         "uv pip install flash-attn --no-build-isolation && "
         # Set environment variables
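The task-file copy builds the scp command as an argument list, which sidesteps shell quoting, and normalizes Windows backslashes first. A standalone sketch with hypothetical connection values:

import os
import subprocess  # noqa: S404  # used by the optional run below

# Hypothetical values; in the diff these come from the pod connection details.
ssh_key_path = os.path.expanduser("~/.ssh/id_ed25519")
ssh_port = "2222"
ssh_user_host = "root@203.0.113.7"
dataset = r"C:\work\tasks.json"  # Windows-style path, as the comment above anticipates

dataset_path = dataset.replace("\\", "/")  # scp expects forward slashes
remote_dataset = f"/root/{os.path.basename(dataset_path)}"

# List form avoids shell quoting issues on every platform
scp_cmd = ["scp", "-i", ssh_key_path, "-P", ssh_port, dataset_path,
           f"{ssh_user_host}:{remote_dataset}"]
print(" ".join(scp_cmd))
# subprocess.run(scp_cmd, check=True)  # uncomment to actually copy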
hud/cli/rl/train.py CHANGED

@@ -23,6 +23,40 @@ from .utils import (
 design = HUDDesign()


+def find_task_json_files() -> list[Path]:
+    """Find JSON files containing tasks in the current directory."""
+    json_files = []
+    patterns = [
+        "*task*.json",
+        "*eval*.json",
+        "*Task*.json",
+        "*Eval*.json",
+        "*TASK*.json",
+        "*EVAL*.json",
+        "tasks.json",  # Most common name
+    ]
+
+    # First check current directory
+    for pattern in patterns:
+        json_files.extend(Path(".").glob(pattern))
+
+    # If no files found, search one level deep
+    if not json_files:
+        for pattern in patterns:
+            json_files.extend(Path(".").glob(f"*/{pattern}"))
+
+    # Remove duplicates and sort, prioritizing "tasks.json"
+    json_files = sorted(set(json_files))
+
+    # Put tasks.json first if it exists
+    tasks_json = Path("tasks.json")
+    if tasks_json in json_files:
+        json_files.remove(tasks_json)
+        json_files.insert(0, tasks_json)
+
+    return json_files
+
+
 def train_command_wrapper(
     model: str,
     dataset: str | None,
@@ -128,45 +162,22 @@ def train_command_wrapper(
         raise typer.Exit(1)

     if "dataset" in missing:
-
-
-
-
-
-
+        if missing["dataset"] == "multiple_json":
+            # Multiple JSON files found, let user choose
+            json_files = find_task_json_files()
+            design.info("Multiple task files found:")
+            file_choice = design.select(
+                "Select a task file to use:",
+                choices=[str(f) for f in json_files],
+            )
+            dataset = file_choice
+            design.success(f"Selected: {dataset}")
+        elif missing["dataset"] == "none":
+            design.error("No dataset specified and no task JSON files found")
+            design.info("Please use --dataset or create a tasks.json file")
+            design.hint(
+                "Example: hud hf --name my-org/my-tasks  # Generate tasks from HUD evaluation"
             )
-
-        if create_dataset == "Yes, upload to HuggingFace":
-            dataset_name = typer.prompt("Enter dataset name (e.g., username/dataset-name)")
-
-            if not validate_dataset_name(dataset_name):
-                design.error("Invalid dataset name format. Expected: username/dataset-name")
-                raise typer.Exit(1)
-
-            design.info(f"Running 'hud hf tasks.json --name {dataset_name}'...")
-            design.info("")
-
-            # Run hf command
-            result = subprocess.run(  # noqa: S603
-                ["hud", "hf", "tasks.json", "--name", dataset_name],  # noqa: S607
-                capture_output=True,
-                text=True,
-            )
-
-            if result.returncode == 0:
-                design.success("Dataset uploaded successfully")
-                dataset = dataset_name
-            else:
-                design.error("Failed to upload dataset")
-                if result.stderr:
-                    design.error(result.stderr)
-                raise typer.Exit(1)
-        else:
-            design.info("Please specify a dataset with --dataset")
-            raise typer.Exit(1)
-    else:
-        design.error("No dataset specified and no tasks.json found")
-        design.info("Use --dataset to specify a HuggingFace dataset")
             raise typer.Exit(1)

     # Ask about pod creation for Prime training
@@ -247,9 +258,123 @@ async def train_command(
         design.hint("Run 'hud build' first or specify with 'hud rl init <image>'")
         raise typer.Exit(1)

-    #
+    # Handle dataset (JSON file or HuggingFace dataset)
     dataset_size = None
-
+    is_json_file = False
+
+    # Use dataset from command or look for JSON files
+    if not dataset:
+        # Check for JSON files if no dataset specified
+        json_files = find_task_json_files()
+        if json_files:
+            if len(json_files) == 1:
+                dataset = str(json_files[0])
+                design.info(f"Found task file: {dataset}")
+                is_json_file = True
+            else:
+                # This case should have been handled in train_command_wrapper
+                design.error("Multiple task files found but none selected")
+                raise typer.Exit(1)
+        else:
+            # Use dataset from lock file
+            dataset = get_primary_dataset()
+            if dataset:
+                design.info(f"Using dataset from lock file: {dataset}")
+
+    # Check if dataset is a file path
+    if dataset and Path(dataset).exists() and dataset.endswith(".json"):
+        is_json_file = True
+
+    # Validate dataset
+    if dataset and is_json_file:
+        # Load and validate JSON file
+        design.info(f"Validating task file: {dataset}")
+        try:
+            with open(dataset) as f:  # noqa: ASYNC230
+                tasks_data = json.load(f)
+
+            # Handle both single task and array of tasks
+            if isinstance(tasks_data, dict):
+                tasks = [tasks_data]
+            elif isinstance(tasks_data, list):
+                tasks = tasks_data
+            else:
+                design.error("Invalid tasks file format")
+                raise typer.Exit(1)
+
+            dataset_size = len(tasks)
+            if dataset_size < 4:
+                design.error(f"Task file has only {dataset_size} tasks")
+                design.info("RL training requires at least 4 tasks for proper batching")
+                design.hint("Consider adding more tasks to your JSON file")
+                raise typer.Exit(1)
+
+            design.success(f"✓ Task file has {dataset_size} tasks")
+
+            # Check and convert MCP configs to remote if needed
+            if tasks:
+                sample_task = tasks[0]
+                sample_mcp_config = sample_task.get("mcp_config", {})
+
+                # Check if using local MCP configs
+                config_type = "unknown"
+                for server_config in sample_mcp_config.values():
+                    if isinstance(server_config, dict) and "url" in server_config:
+                        url = server_config.get("url", "")
+                        if "mcp.hud.so" in url:
+                            config_type = "remote"
+                            break
+                    else:
+                        config_type = "local"
+
+                if config_type == "local":
+                    design.info("Converting local MCP configs to remote for training...")
+
+                    # Get the image name from lock file or environment
+                    from .utils import get_image_from_lock
+
+                    env_image = image or get_image_from_lock()
+
+                    if not env_image:
+                        design.error("No image found for remote MCP conversion")
+                        design.hint("Run 'hud build' first")
+                        raise typer.Exit(1)
+
+                    # Check if image needs to be pushed
+                    if "/" not in env_image or env_image.startswith("local/"):
+                        design.warning(f"Image '{env_image}' appears to be local only")
+                        design.info("Running 'hud push' to make it publicly available...")
+                        from hud.cli.push import push_command
+
+                        push_command(directory=".", yes=True)
+                        design.success("Image pushed successfully")
+                        # Re-read image name after push
+                        env_image = get_image_from_lock()
+
+                    # Convert all tasks to use remote MCP
+                    for task in tasks:
+                        remote_config = {
+                            "hud": {
+                                "url": "https://mcp.hud.so/v3/mcp",
+                                "headers": {
+                                    "Authorization": "Bearer $HUD_API_KEY",
+                                    "Mcp-Image": env_image,
+                                },
+                            }
+                        }
+                        task["mcp_config"] = remote_config
+
+                    design.success("✓ Converted all tasks to use remote MCP configs")
+
+                    # Save the modified tasks back to the file
+                    with open(dataset, "w") as f:  # noqa: ASYNC230
+                        json.dump(tasks, f, indent=2)
+                    design.info("Updated task file with remote configs")
+        except json.JSONDecodeError as e:
+            design.error(f"Invalid JSON in task file: {e}")
+            raise typer.Exit(1) from e
+    elif dataset:
+        # Validate HuggingFace dataset
         design.info(f"Validating dataset: {dataset}")
         try:
             # Try to load dataset info from HuggingFace
@@ -273,12 +398,6 @@ async def train_command(
             design.warning(f"Could not validate dataset size: {e}")
             design.info("Proceeding with training - ensure dataset has at least 4 tasks")

-    # Use dataset from command or lock file
-    if not dataset:
-        dataset = get_primary_dataset()
-        if dataset:
-            design.info(f"Using dataset from lock file: {dataset}")
-
     # Display configuration
     design.section_title("📋 Training Configuration")
     design.json_config(
@@ -318,6 +437,7 @@ async def train_command(
         auto_create_pod=auto_create_pod,
         team_id=team_id,
         dataset_size=dataset_size,
+        is_json_file=is_json_file,
     )


@@ -340,10 +460,19 @@ def check_requirements(config: Path | None, dataset: str | None) -> dict[str, An

     # Check dataset
     if not dataset:
-        #
-
-        if
-
+        # First check for JSON files (preferred method)
+        json_files = find_task_json_files()
+        if json_files:
+            if len(json_files) == 1:
+                # Will be auto-selected
+                pass
+            else:
+                missing["dataset"] = "multiple_json"
+        else:
+            # Check lock file for HuggingFace dataset
+            primary_dataset = get_primary_dataset()
+            if not primary_dataset:
+                missing["dataset"] = "none"

     return missing

@@ -407,13 +536,23 @@ async def run_remote_training(
     auto_create_pod: str | None = None,
     team_id: str | None = None,
     dataset_size: int | None = None,
+    is_json_file: bool = False,
 ) -> None:
     """Run training on remote infrastructure."""
     design.section_title("🚀 Remote Training")

     if provider == "prime":
         await run_prime_training(
-            model,
+            model,
+            dataset,
+            config,
+            gpus,
+            output_dir,
+            image,
+            auto_create_pod,
+            team_id,
+            dataset_size,
+            is_json_file,
         )
     else:
         design.error(f"Provider '{provider}' not yet supported")
hud/datasets/execution/parallel.py CHANGED

@@ -40,6 +40,7 @@ def _process_worker(
     2. Creates its own event loop
     3. Processes a batch of tasks asynchronously
     4. Returns results with their original indices
+    5. Handles interruption signals gracefully

     Args:
         task_batch: List of (index, task_dict) tuples
@@ -58,6 +59,7 @@ def _process_worker(
         List of (index, result) tuples
     """
     # Import inside worker to avoid pickling issues
+    import signal
     import sys

     import hud
@@ -72,6 +74,14 @@ def _process_worker(
     except AttributeError:
         pass

+    # Set up signal handler for clean interruption
+    def signal_handler(signum: int, frame: Any) -> None:
+        logger.warning("Worker %s: Received interrupt signal", worker_id)
+        # Raise KeyboardInterrupt to actually interrupt the worker
+        raise KeyboardInterrupt(f"Worker {worker_id} interrupted by user")
+
+    signal.signal(signal.SIGINT, signal_handler)
+
     # Reinitialize telemetry in this process
     configure_telemetry()

@@ -157,8 +167,25 @@ def _process_worker(
         # Process all tasks in parallel within this process
         tasks = [process_single_task(idx, task_dict) for idx, task_dict in task_batch]

-
-
+        try:
+            results = await asyncio.gather(*tasks, return_exceptions=False)
+            return results
+        except asyncio.CancelledError:
+            logger.info("Worker %s: Tasks cancelled due to interruption", worker_id)
+            # Return error results for all tasks
+            return [
+                (
+                    idx,
+                    {
+                        "error": "Task cancelled (Ctrl+C)",
+                        "isError": True,
+                        "reward": 0.0,
+                        "done": False,
+                        "content": "Task cancelled",
+                    },
+                )
+                for idx, _ in task_batch
+            ]

     try:
         # Run the async batch processing
@@ -180,6 +207,24 @@ def _process_worker(
             logger.warning("Worker %s: Telemetry flush timed out", worker_id)

         return results
+    except KeyboardInterrupt:
+        logger.info("Worker %s: Interrupted by user, stopping gracefully", worker_id)
+        # Return partial results for tasks that completed
+        partial_results = []
+        for idx, _ in task_batch:
+            partial_results.append(
+                (
+                    idx,
+                    {
+                        "error": "Worker interrupted by user (Ctrl+C)",
+                        "isError": True,
+                        "reward": 0.0,
+                        "done": False,
+                        "content": "Task interrupted",
+                    },
+                )
+            )
+        return partial_results
     except Exception as e:
         logger.error("[Worker %s] Batch processing failed: %s", worker_id, e)
         logger.error("Worker %s batch processing failed: %s", worker_id, e)
@@ -365,7 +410,8 @@ async def run_dataset_parallel_manual(
     )

     # Process batches in parallel using ProcessPoolExecutor
-
+    executor = ProcessPoolExecutor(max_workers=max_workers)
+    try:
         # Submit all batches to workers
         future_to_batch = {
             executor.submit(worker_func, batch, worker_id=i): batch
@@ -377,48 +423,78 @@ async def run_dataset_parallel_manual(
         total = len(task_dicts)

         # Process results as they complete
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            for future in as_completed(future_to_batch):
+                batch = future_to_batch[future]
+
+                try:
+                    # Get results from this worker
+                    batch_results = future.result()
+
+                    # Place results in correct positions
+                    for index, result in batch_results:
+                        results[index] = result
+                        completed += 1
+
+                    # Calculate success rate so far
+                    successful_so_far = sum(
+                        1
+                        for r in results[:completed]
+                        if r is not None and getattr(r, "reward", 0) > 0
+                    )

-
-
-
-
-
-
+                    progress_msg = (
+                        f"Progress: {completed}/{total} tasks completed "
+                        f"({100 * completed / total:.1f}%) | "
+                        f"Success rate: {successful_so_far}/{completed} "
+                        f"({100 * successful_so_far / completed:.1f}%)"
+                    )

-
+                    logger.info(progress_msg)

-
-
-
+                except Exception as e:
+                    # Handle worker failure
+                    logger.error(
+                        "Worker failed with exception: %s\n%s", e, traceback.format_exc()
+                    )

-
-
-
-
+                    # Mark all tasks in this batch as failed
+                    for index, _ in batch:
+                        results[index] = {
+                            "error": f"Worker process failed: {e}",
+                            "isError": True,
+                            "reward": 0.0,
+                            "done": False,
+                            "content": f"Worker process failed: {e}",
+                        }
+                        completed += 1
+
+        except KeyboardInterrupt:
+            logger.warning("\n⚠️ Parallel evaluation interrupted by user (Ctrl+C)")
+            logger.info("Cancelling pending tasks...")
+
+            # Cancel all pending futures
+            for future in future_to_batch:
+                if not future.done():
+                    future.cancel()
+
+            # Mark uncompleted tasks as interrupted
+            for i, r in enumerate(results):
+                if r is None:
+                    results[i] = {
+                        "error": "Evaluation interrupted by user",
                         "isError": True,
                         "reward": 0.0,
                         "done": False,
-                        "content":
+                        "content": "Task interrupted (Ctrl+C)",
                     }
-
+
+            logger.info("Interrupted after %s/%s tasks", completed, total)
+            raise  # Re-raise to propagate the interrupt
+
+    finally:
+        # Always shutdown the executor properly
+        executor.shutdown(wait=False, cancel_futures=True)

     # Verify all results are populated
     missing = [i for i, r in enumerate(results) if r is None]
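The interruption handling combines a SIGINT handler inside each worker with executor.shutdown(wait=False, cancel_futures=True) in a finally block. A minimal self-contained sketch of the same pattern, independent of hud:

import signal
import time
from concurrent.futures import ProcessPoolExecutor, as_completed


def worker(batch_id: int) -> int:
    """Stand-in for a batch of tasks; sleeps instead of doing real work."""
    # Ensure SIGINT raises KeyboardInterrupt inside the worker process
    signal.signal(signal.SIGINT, signal.default_int_handler)
    time.sleep(1)
    return batch_id * 2


if __name__ == "__main__":
    executor = ProcessPoolExecutor(max_workers=4)
    futures = {}
    try:
        futures = {executor.submit(worker, i): i for i in range(8)}
        for future in as_completed(futures):
            print(f"batch {futures[future]} -> {future.result()}")
    except KeyboardInterrupt:
        # Cancel anything not started, then propagate, as the diff does
        for future in futures:
            future.cancel()
        raise
    finally:
        # cancel_futures requires Python 3.9+
        executor.shutdown(wait=False, cancel_futures=True)

Creating the executor outside a with-block, as the diff now does, lets the finally clause shut it down without waiting, so a Ctrl+C returns promptly instead of blocking on in-flight batches.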
hud/otel/exporters.py CHANGED

@@ -14,6 +14,7 @@ from __future__ import annotations

 import contextlib
 import json
 import logging
+import time
 from collections import defaultdict
 from datetime import UTC, datetime
 from typing import TYPE_CHECKING, Any
@@ -362,5 +363,7 @@ class HudSpanExporter(SpanExporter):
         pass

     def force_flush(self, timeout_millis: int | None = None) -> bool:  # type: ignore[override]
+        if timeout_millis:
+            time.sleep(timeout_millis / 1000)
         # Synchronous export, nothing buffered here
         return True
hud/otel/processors.py CHANGED

@@ -1,6 +1,7 @@
 from __future__ import annotations

 import logging
+import time
 from typing import Any

 from opentelemetry import baggage
@@ -115,4 +116,6 @@ class HudEnrichmentProcessor(SpanProcessor):
         pass

     def force_flush(self, timeout_millis: int | None = None) -> bool:  # type: ignore[override]
+        if timeout_millis:
+            time.sleep(timeout_millis / 1000)
     return True
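With this change, a caller that passes timeout_millis to force_flush now blocks for the full timeout before getting True back. A small sketch of the observable effect; the class below is a simplified stand-in, not the hud exporter:

import time

class FlushLike:
    # Simplified stand-in mirroring the force_flush change above
    def force_flush(self, timeout_millis: int | None = None) -> bool:
        if timeout_millis:
            time.sleep(timeout_millis / 1000)
        return True

start = time.monotonic()
FlushLike().force_flush(timeout_millis=500)
print(f"returned after ~{time.monotonic() - start:.1f}s")  # ~0.5s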
hud/utils/tests/test_version.py CHANGED

hud/version.py CHANGED

{hud_python-0.4.16.dist-info → hud_python-0.4.18.dist-info}/RECORD CHANGED

@@ -2,15 +2,15 @@ hud/__init__.py,sha256=BjAhZtsHbGN371Q8t3o4v4jltedkmDE85xW0yOILU9g,397
 hud/__main__.py,sha256=YR8Dq8OhINOsVfQ55PmRXXg4fEK84Rt_-rMtJ5rvhWo,145
 hud/settings.py,sha256=q9aZiHjvbL4oLE-N8AttTW4rmzS8zPMnsca-iMGyEGc,2362
 hud/types.py,sha256=gNnyS1G7aYHIR5sT3k3bOfSTFnPylUO6lNGLWbjbeYk,5149
-hud/version.py,sha256=
+hud/version.py,sha256=8Ag1N-qzwxUt5QwVLTJ5Z43L6M6O6FLpCKva6zONOfc,105
 hud/agents/__init__.py,sha256=UoIkljWdbq4bM0LD-mSaw6w826EqdEjOk7r6glNYwYQ,286
 hud/agents/base.py,sha256=rbwYP_a6XTwhY_5CaBlE7SWflnTq1EOuDiNY2XeUWdM,28275
-hud/agents/claude.py,sha256=
+hud/agents/claude.py,sha256=_eD_XKZhVJ6grkHQfbS6JskztueomQcmJeGJMbfNdmE,14534
 hud/agents/langchain.py,sha256=1EgCy8jfjunsWxlPC5XfvfLS6_XZVrIF1ZjtHcrvhYw,9584
-hud/agents/openai.py,sha256=
-hud/agents/openai_chat_generic.py,sha256=
+hud/agents/openai.py,sha256=tvFYsZ5yaoLkfjMnHe-COxRttMsLRXBLPdSqgeipQRk,14257
+hud/agents/openai_chat_generic.py,sha256=Q6eKlKQIF2o04eGpIcBAyqpdcgRvuolbxmgWTT6ktEQ,10478
 hud/agents/misc/__init__.py,sha256=BYi4Ytp9b_vycpZFXnr5Oyw6ncKLNNGml8Jrb7bWUb4,136
-hud/agents/misc/response_agent.py,sha256=
+hud/agents/misc/response_agent.py,sha256=pnaomb4H-QJm1YKU3tC1YnZXxOlDbTHIXaIH-6Nkb6I,3102
 hud/agents/tests/__init__.py,sha256=W-O-_4i34d9TTyEHV-O_q1Ai1gLhzwDaaPo02_TWQIY,34
 hud/agents/tests/test_base.py,sha256=F39ajSqASGUbPyPoWSY9KARFav62qNTK74W11Tr1Tg4,28970
 hud/agents/tests/test_claude.py,sha256=wqEKlzEvx8obz1sSm4NY0j-Zyt1qWNfDOmRqYIuAEd0,13069
@@ -31,11 +31,11 @@ hud/cli/pull.py,sha256=JHwCwUwRO0Nzbgm9mkjsz6EpxbxgwQVhgNSY64nNZ-s,11969
 hud/cli/push.py,sha256=4KrEHj0_i3xJNCB3eRjANmHFhSW4MFfpnld3nfVYENs,17904
 hud/cli/remove.py,sha256=USAvB6pbMA3jd19xUtLEBiMsklVTEfE2Maw9nYcpSAE,6619
 hud/cli/rl/README.md,sha256=3pqRZMrnwD-lJwWGCCNZNhGdZG6zyydLBOer0e8BkLw,5983
-hud/cli/rl/__init__.py,sha256=
+hud/cli/rl/__init__.py,sha256=g_Crqn5o0m9xANrTOkQZENWVlwHAV6MWiobte-FfqiY,3412
 hud/cli/rl/init.py,sha256=GXVOXLrX8CVAgpJ1pHuk6Y6oujbh46Rtz8kG18jGzk8,13789
-hud/cli/rl/pod.py,sha256=
-hud/cli/rl/ssh.py,sha256=
-hud/cli/rl/train.py,sha256=
+hud/cli/rl/pod.py,sha256=ZiXI-RG9YsnKx1EWzufcqklBdaD_d6XFtD45a0H8KpM,18837
+hud/cli/rl/ssh.py,sha256=bHAieonseJPON7P1mwB2GPWKLDlLZuvQniONmr5ZfcE,11523
+hud/cli/rl/train.py,sha256=sjY4J0TCp8647kzuIHyEeIsFVGtE0tllT0GzhkPPrWY,19895
 hud/cli/rl/utils.py,sha256=ZW3sjl5KaHZaOCjAbut_QIpQvxgzlxjPGuM6fuYkU9I,4836
 hud/cli/tests/__init__.py,sha256=ZrGVkmH7DHXGqOvjOSNGZeMYaFIRB2K8c6hwr8FPJ-8,68
 hud/cli/tests/test_analyze.py,sha256=SwxvRlnw-VaEwKN2nd1FJAxfhieujPjh7PdQh_LYJ5E,11050
@@ -79,7 +79,7 @@ hud/datasets/__init__.py,sha256=74T4mrjELKtE04XkZKwU8QAJcg2wjqXLqRO9s4GlPr4,678
 hud/datasets/task.py,sha256=V82HzRb2_c2MO9EG5ZcY-PMsLt3234Uks7WlkMta5HY,3615
 hud/datasets/utils.py,sha256=3hKvZTkZuCRkTeITB86nNdA1dtHZAqFfAdSPMtcTUhs,4275
 hud/datasets/execution/__init__.py,sha256=4m1AEpMQaUSJFVN_iAXvY6zFttVgZKwE6oQtC0Rrk7U,330
-hud/datasets/execution/parallel.py,sha256=
+hud/datasets/execution/parallel.py,sha256=4aL1XpS3vOBqZjgs0vrMZJ4eAoi86Td8C-m5SUtVxMs,25231
 hud/datasets/execution/runner.py,sha256=EEvb90vvAqFXXx8NyVKLfK5p-gtsfJqiFJAoqSjyfXg,4695
 hud/misc/__init__.py,sha256=m_pprQQ-G-Y0Sd0NEiR8MtAMbElnuFZ2OWT8TXrw7c4,43
 hud/misc/claude_plays_pokemon.py,sha256=IthAkjDVr2Q-GNvX-QLJyMzN7-0pHqqJbagGNv2m7yo,10453
@@ -87,9 +87,9 @@ hud/otel/__init__.py,sha256=ii17ayoWiS5vAhA7UAmZ8TkmP52gs2pWyHsD46-uYbE,1003
 hud/otel/collector.py,sha256=jLZymZ8r7xt2VDuWexfbnT7PY1-0aiyLMgjBy8KDY1M,4497
 hud/otel/config.py,sha256=6np_C2UXhtKHHjY41HQxZElua2Eh_EUCBiRB_YuiSuc,6249
 hud/otel/context.py,sha256=C9MvO99cRSNNDEDC7ehO3eoTPnb6J7AemUYvEp57yEU,17774
-hud/otel/exporters.py,sha256=
+hud/otel/exporters.py,sha256=RLAjWa8b2DJEU21740Idq4fmeIuabLEqGGUspcFDcH4,14331
 hud/otel/instrumentation.py,sha256=xTjrkn2p490lJ8UlSD1SfzkPZsD8XKDocQqYQfwMMKo,3775
-hud/otel/processors.py,sha256
+hud/otel/processors.py,sha256=-gGRbwifplcExDQBLfx_9tqWreDImULJNcENgO9q7VU,4700
 hud/otel/tests/__init__.py,sha256=VNJKBMaxTtbn7trW-1Ph50zCvCok_wTSGcI1HD6GOLA,43
 hud/otel/tests/test_processors.py,sha256=np0R4ssd9j6LJSJykJ5bNjl0POwNYNhgb7BqOZHwcMY,6778
 hud/server/__init__.py,sha256=8LUwgsXO8xiViWP7uImDwcOsWLu01r5F4r8U8qH3rSY,91
@@ -157,10 +157,10 @@ hud/utils/tests/test_init.py,sha256=2QLQSGgyP9wJhOvPCusm_zjJad0qApOZi1BXpxcdHXQ,
 hud/utils/tests/test_mcp.py,sha256=0pUa16mL-bqbZDXp5NHBnt1gO5o10BOg7zTMHZ1DNPM,4023
 hud/utils/tests/test_progress.py,sha256=QSF7Kpi03Ff_l3mAeqW9qs1nhK50j9vBiSobZq7T4f4,7394
 hud/utils/tests/test_telemetry.py,sha256=5jl7bEx8C8b-FfFUko5pf4UY-mPOR-9HaeL98dGtVHM,2781
-hud/utils/tests/test_version.py,sha256=
+hud/utils/tests/test_version.py,sha256=Ur5o4UVJbPy4rYJUIc3yBCTK-mk9CAf_7bHv2qSPJEI,160
 hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hud_python-0.4.
-hud_python-0.4.
-hud_python-0.4.
-hud_python-0.4.
-hud_python-0.4.
+hud_python-0.4.18.dist-info/METADATA,sha256=vvUR4EBJmH6WqrLg2OxsupIJLs_6S8aVPaCRJjN3sJI,20287
+hud_python-0.4.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+hud_python-0.4.18.dist-info/entry_points.txt,sha256=jJbodNFg1m0-CDofe5AHvB4zKBq7sSdP97-ohaQ3ae4,63
+hud_python-0.4.18.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
+hud_python-0.4.18.dist-info/RECORD,,
{hud_python-0.4.16.dist-info → hud_python-0.4.18.dist-info}/WHEEL
File without changes

{hud_python-0.4.16.dist-info → hud_python-0.4.18.dist-info}/entry_points.txt
File without changes

{hud_python-0.4.16.dist-info → hud_python-0.4.18.dist-info}/licenses/LICENSE
File without changes