hud-python 0.4.14__py3-none-any.whl → 0.4.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

@@ -0,0 +1,123 @@
1
+ """Standard asyncio-based dataset runner."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ from typing import TYPE_CHECKING, Any, cast
8
+
9
+ from datasets import Dataset, load_dataset
10
+
11
+ from hud.agents.misc import ResponseAgent
12
+ from hud.datasets.task import Task
13
+
14
+ if TYPE_CHECKING:
15
+ from hud.agents import MCPAgent
16
+
17
+ logger = logging.getLogger("hud.datasets")
18
+
19
+
20
async def run_dataset(
    name: str,
    dataset: str | Dataset | list[dict[str, Any]],
    agent_class: type[MCPAgent],
    agent_config: dict[str, Any] | None = None,
    max_concurrent: int = 50,
    metadata: dict[str, Any] | None = None,
    max_steps: int = 10,
    split: str = "train",
    auto_respond: bool = False,
    custom_system_prompt: str | None = None,
) -> list[Any]:
    """
    Run all tasks in a dataset with automatic job tracking.

    Each task runs inside its own ``hud.trace`` under a single ``hud.job``.
    A failing task does not abort the batch: its exception is logged and its
    slot in the returned list stays ``None``.

    Args:
        name: Name for the job
        dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
            Dataset object, OR list of task dicts (each is passed to ``Task(**d)``)
        agent_class: Agent class to instantiate (e.g., ClaudeAgent)
        agent_config: Configuration/kwargs for agent (model, etc.)
        max_concurrent: Maximum parallel task execution (must be >= 1)
        metadata: Optional metadata for the job (copied, never mutated)
        max_steps: Maximum steps per task
        split: Dataset split to use when loading from string (default: "train")
        auto_respond: Whether to use auto-response agent
        custom_system_prompt: System prompt applied to every task that does not
            already define its own ``system_prompt``

    Returns:
        List of results from agent.run() in dataset order; ``None`` for any
        task that raised.

    Raises:
        ValueError: If ``max_concurrent`` is less than 1.

    Example:
        >>> from hud.agents import ClaudeAgent
        >>> # Option 1: From dataset string identifier
        >>> results = await run_dataset(
        ...     "SheetBench Eval",
        ...     "hud-evals/SheetBench-50",
        ...     ClaudeAgent,
        ...     {"model": "claude-3-5-sonnet-20241022"},
        ... )
        >>> # Option 2: From HuggingFace dataset object
        >>> from datasets import load_dataset
        >>> dataset = load_dataset("hud-evals/SheetBench-50", split="train")
        >>> results = await run_dataset("my_eval", dataset, ClaudeAgent)
        >>> # Option 3: From list of dicts
        >>> tasks = [{"prompt": "...", "mcp_config": {...}, ...}, ...]
        >>> results = await run_dataset("browser_eval", tasks, ClaudeAgent)
    """
    # Import here to avoid circular imports
    import hud

    # A zero-sized semaphore would block every worker forever; fail fast instead.
    if max_concurrent < 1:
        raise ValueError(f"max_concurrent must be >= 1, got {max_concurrent}")

    dataset_link = None

    # Load dataset from string if needed
    if isinstance(dataset, str):
        logger.info("Loading dataset %s from HuggingFace...", dataset)
        dataset_link = dataset
        dataset = cast("Dataset", load_dataset(dataset, split=split))

    # Build job metadata on a copy so the caller's dict is left untouched.
    job_metadata: dict[str, Any] = dict(metadata) if metadata else {}
    job_metadata["agent_class"] = agent_class.__name__
    job_metadata["agent_config"] = agent_config

    # Best effort: derive "<project>/<dataset>" from the first download checksum
    # key when a Dataset object was passed in directly. Any failure (missing
    # checksums, unexpected key layout) only costs us the link.
    if isinstance(dataset, Dataset) and not dataset_link:
        try:
            first_source = next(iter(dataset.info.__dict__["download_checksums"]))
            general_info = first_source.split("/")
            project = general_info[3]
            dataset_name = general_info[4].split("@")[0]
            dataset_link = f"{project}/{dataset_name}"
        except Exception:
            logger.warning("Failed to extract dataset verification info")

    with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
        # Semaphore caps how many tasks run at once
        sem = asyncio.Semaphore(max_concurrent)
        results: list[Any | None] = [None] * len(dataset)

        async def _worker(index: int, task_dict: Any) -> None:
            async with sem:
                task_name = task_dict.get("prompt") or f"Task {index}"
                if custom_system_prompt and "system_prompt" not in task_dict:
                    # Rebind to a new dict rather than writing into the caller's data.
                    task_dict = {**task_dict, "system_prompt": custom_system_prompt}
                with hud.trace(task_name, job_id=job_obj.id, task_id=task_dict.get("id")):
                    # Convert dict to Task here, at trace level
                    task = Task(**task_dict)

                    agent = agent_class(**(agent_config or {}))

                    if auto_respond:
                        agent.response_agent = ResponseAgent()
                    results[index] = await agent.run(task, max_steps=max_steps)

        # Execute all tasks; exceptions are returned rather than raised so one
        # failure does not cancel the rest of the batch.
        outcomes = await asyncio.gather(
            *[_worker(i, task) for i, task in enumerate(dataset)],
            return_exceptions=True,
        )

        # Surface failures that would otherwise be silently dropped.
        for index, outcome in enumerate(outcomes):
            if isinstance(outcome, BaseException):
                logger.warning("Task %d failed: %r", index, outcome)

    return results
hud/datasets/task.py ADDED
@@ -0,0 +1,107 @@
1
+ """Task model for HUD datasets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from collections import defaultdict
7
+ from string import Template
8
+ from typing import Any
9
+
10
+ from pydantic import BaseModel, Field, field_validator
11
+
12
+ from hud.settings import settings
13
+ from hud.types import MCPToolCall
14
+
15
+
16
class Task(BaseModel):
    """
    A task configuration that can be used to create a task.

    String values inside ``mcp_config`` are run through ``string.Template``,
    so ``${VAR_NAME}`` (or ``$VAR_NAME``) placeholders are replaced with values
    taken from the process environment and from ``hud.settings``. Unknown
    variables resolve to an empty string, and a literal dollar sign must be
    written as ``$$``.

    NOTE: a ``${VAR_NAME:default_value}`` form is NOT supported; it is not valid
    ``string.Template`` syntax and makes substitution raise ``ValueError``.

    Example:
        mcp_config: {
            "hud": {
                "url": "https://mcp.hud.so/v3/mcp",
                "headers": {
                    "Authorization": "Bearer ${HUD_API_KEY}",
                    "Mcp-Image": "your-mcp-image"
                }
            }
        }
    """

    id: str | None = None
    prompt: str
    mcp_config: dict[str, Any]
    setup_tool: MCPToolCall | list[MCPToolCall] | None = None
    evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
    system_prompt: str | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)

    @field_validator("mcp_config", "metadata", mode="before")
    @classmethod
    def parse_json_strings(cls, v: Any) -> Any:
        """Parse a JSON string into its decoded value; pass anything else through.

        Raises:
            ValueError: If ``v`` is a string that is not valid JSON.
        """
        if isinstance(v, str):
            try:
                return json.loads(v)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON string: {e}") from e
        return v

    @field_validator("setup_tool", "evaluate_tool", mode="before")
    @classmethod
    def convert_dict_to_tool_call(cls, v: Any) -> Any:
        """Convert a dict (or list of dicts) to MCPToolCall, parsing JSON strings first.

        ``None`` is returned unchanged; list items that are not dicts are kept as-is.

        Raises:
            ValueError: If ``v`` is a string that is not valid JSON.
        """
        if v is None:
            return None

        # Parse JSON string if needed
        if isinstance(v, str):
            try:
                v = json.loads(v)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON string: {e}") from e

        if isinstance(v, dict):
            return MCPToolCall(**v)
        if isinstance(v, list):
            return [MCPToolCall(**item) if isinstance(item, dict) else item for item in v]
        return v

    @field_validator("mcp_config", mode="before")
    @classmethod
    def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
        """
        Resolve ``${VAR_NAME}`` placeholders in mcp_config using string.Template.

        Lookup values come from, in increasing precedence:
        the process environment, the fields of ``settings.model_dump()``, and
        ``HUD_API_KEY`` taken from ``settings.api_key`` when that is set.

        Strings are substituted wherever they appear in nested dicts and lists;
        other value types are returned unchanged. Missing variables resolve to
        empty strings.

        Raises:
            ValueError: If a string contains a placeholder that string.Template
                considers invalid (for example a lone ``$`` or ``${NAME:default}``).
        """
        import os

        # Start with current environment variables, then layer settings on top
        mapping: dict[str, Any] = dict(os.environ)
        mapping.update(settings.model_dump())

        if settings.api_key:
            mapping["HUD_API_KEY"] = settings.api_key

        # Built once for the whole config: defaultdict(str) makes any unknown
        # variable resolve to "" instead of raising KeyError.
        safe_mapping: defaultdict[str, Any] = defaultdict(str, mapping)

        def substitute_in_value(obj: Any) -> Any:
            """Recursively substitute variables in nested structures."""
            if isinstance(obj, str):
                return Template(obj).substitute(safe_mapping)
            if isinstance(obj, dict):
                return {k: substitute_in_value(item) for k, item in obj.items()}
            if isinstance(obj, list):
                return [substitute_in_value(item) for item in obj]
            return obj

        return substitute_in_value(v)
hud/datasets/utils.py ADDED
@@ -0,0 +1,118 @@
1
+ """Dataset utilities for loading, saving, and fetching datasets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ from typing import Any
8
+
9
+ from datasets import Dataset
10
+
11
+ from .task import Task
12
+
13
+ logger = logging.getLogger("hud.datasets")
14
+
15
+
16
async def fetch_system_prompt_from_dataset(dataset_id: str) -> str | None:
    """
    Look up ``system_prompt.txt`` in a HuggingFace dataset repository.

    Every failure mode is logged and reported as ``None`` rather than raised.

    Args:
        dataset_id: HuggingFace dataset identifier (e.g., "hud-evals/SheetBench-50")

    Returns:
        The stripped file contents, or None when the file is absent or empty,
        when huggingface_hub cannot be imported, or when any other error occurs.
    """
    try:
        # Deferred import so huggingface_hub is only needed when this helper runs
        from huggingface_hub import hf_hub_download
        from huggingface_hub.errors import EntryNotFoundError

        try:
            local_path = hf_hub_download(
                repo_id=dataset_id, filename="system_prompt.txt", repo_type="dataset"
            )
            with open(local_path, encoding="utf-8") as handle:  # noqa: ASYNC230
                prompt_text = handle.read().strip()
        except EntryNotFoundError:
            logger.debug("No system_prompt.txt found in dataset %s", dataset_id)
            return None

        # Guard clause: a file containing only whitespace counts as no prompt
        if not prompt_text:
            logger.warning("System prompt file is empty in %s", dataset_id)
            return None

        logger.info(
            "Loaded system prompt from %s (length: %d chars)", dataset_id, len(prompt_text)
        )
        return prompt_text

    except ImportError:
        logger.warning(
            "huggingface_hub not installed. Install it to fetch system prompts from datasets."
        )
        return None
    except Exception as e:
        logger.error("Error fetching system prompt from %s: %s", dataset_id, e)
        return None
61
+
62
+
63
def save_tasks(
    tasks: list[dict[str, Any]], repo_id: str, fields: list[str] | None = None, **kwargs: Any
) -> None:
    """
    Push a list of task dictionaries to a HuggingFace dataset repository.

    Dict and list values are stored as JSON strings, ``None`` becomes an empty
    string, str/int/float/bool values are stored unchanged, and any other type
    is stored as ``str(value)``.

    Args:
        tasks: List of dictionaries to save
        repo_id: HuggingFace repository ID (e.g., "hud-evals/my-tasks")
        fields: Optional list of fields to save. If None, saves all fields from each dict.
            Listed fields that a given dict lacks are skipped for that row.
        **kwargs: Additional arguments passed to dataset.push_to_hub()

    Raises:
        ValueError: If any item is a Task object instead of a raw dictionary
            (Task objects carry resolved environment variables, i.e. secrets).
    """
    # Up-front guard on the first item, before any row is built
    if tasks and isinstance(tasks[0], Task):
        raise ValueError(
            "save_tasks expects dictionaries, not Task objects. "
            "Task objects have resolved environment variables which would expose secrets. "
            "Please pass raw dictionaries with template strings like '${HUD_API_KEY}' preserved."
        )

    def _to_cell(value: Any) -> Any:
        """Map one field value to what is stored in the dataset row."""
        if isinstance(value, (dict, list)):
            return json.dumps(value)
        if value is None:
            return ""
        if isinstance(value, (str, int, float, bool)):
            return value
        return str(value)

    rows = []
    for i, entry in enumerate(tasks):
        # Same guard per item, in case a Task is hiding further down the list
        if isinstance(entry, Task):
            raise ValueError(
                f"Item {i} is a Task object, not a dictionary. "
                "This would expose resolved environment variables. "
                "Please convert to dictionary format with template strings preserved."
            )

        selected = list(entry.keys()) if fields is None else fields
        rows.append({key: _to_cell(entry[key]) for key in selected if key in entry})

    # Create and push dataset
    Dataset.from_list(rows).push_to_hub(repo_id, **kwargs)
@@ -32,8 +32,9 @@ def install_mcp_instrumentation(provider: TracerProvider) -> None:
32
32
  try:
33
33
  # First, patch the _instruments to use our fork
34
34
  import opentelemetry.instrumentation.mcp.instrumentation as mcp_inst
35
+
35
36
  mcp_inst._instruments = ("hud-mcp-python-sdk >= 3.13.1",)
36
-
37
+
37
38
  from opentelemetry.instrumentation.mcp.instrumentation import (
38
39
  McpInstrumentor,
39
40
  )
hud/server/server.py CHANGED
@@ -33,6 +33,8 @@ def _run_with_sigterm(coro_fn: Callable[..., Any], *args: Any, **kwargs: Any) ->
33
33
  """Run *coro_fn* via anyio.run() and cancel on SIGTERM or SIGINT (POSIX)."""
34
34
  global _sigterm_received
35
35
 
36
+ sys.stderr.flush()
37
+
36
38
  async def _runner() -> None:
37
39
  stop_evt: asyncio.Event | None = None
38
40
  if sys.platform != "win32" and os.getenv("FASTMCP_DISABLE_SIGTERM_HANDLER") != "1":
@@ -43,28 +45,46 @@ def _run_with_sigterm(coro_fn: Callable[..., Any], *args: Any, **kwargs: Any) ->
43
45
  def handle_sigterm() -> None:
44
46
  global _sigterm_received
45
47
  _sigterm_received = True
46
- logger.info("Received SIGTERM signal")
48
+ logger.info("Received SIGTERM signal, setting shutdown flag")
49
+ stop_evt.set()
50
+
51
+ # Handle SIGINT for hot-reload
52
+ def handle_sigint() -> None:
53
+ logger.info("Received SIGINT signal, triggering hot reload...")
54
+ # Don't set _sigterm_received for SIGINT
47
55
  stop_evt.set()
48
56
 
49
57
  # Handle both SIGTERM and SIGINT for graceful shutdown
50
- if signal.getsignal(signal.SIGTERM) is signal.SIG_DFL:
58
+ # In Docker containers, we always want to register our handlers
59
+ try:
51
60
  loop.add_signal_handler(signal.SIGTERM, handle_sigterm)
52
- if signal.getsignal(signal.SIGINT) is signal.SIG_DFL:
53
- loop.add_signal_handler(signal.SIGINT, stop_evt.set)
54
-
55
- async with anyio.create_task_group() as tg:
56
- tg.start_soon(coro_fn, *args, **kwargs)
57
-
58
- if stop_evt is not None:
59
-
60
- async def _watch() -> None:
61
- logger.info("Waiting for SIGTERM or SIGINT")
62
- if stop_evt is not None:
63
- await stop_evt.wait()
64
- logger.debug("Received shutdown signal, cancelling tasks...")
65
- tg.cancel_scope.cancel()
66
-
67
- tg.start_soon(_watch)
61
+ logger.info("SIGTERM handler registered")
62
+ except (ValueError, OSError) as e:
63
+ logger.warning("Could not register SIGTERM handler: %s", e)
64
+
65
+ try:
66
+ loop.add_signal_handler(signal.SIGINT, handle_sigint)
67
+ logger.info("SIGINT handler registered")
68
+ except (ValueError, OSError) as e:
69
+ logger.warning("Could not register SIGINT handler: %s", e)
70
+
71
+ try:
72
+ async with anyio.create_task_group() as tg:
73
+ tg.start_soon(coro_fn, *args, **kwargs)
74
+
75
+ if stop_evt is not None:
76
+
77
+ async def _watch() -> None:
78
+ logger.info("Signal handler ready, waiting for SIGTERM or SIGINT")
79
+ if stop_evt is not None:
80
+ await stop_evt.wait()
81
+ logger.info("Shutdown signal received, initiating graceful shutdown...")
82
+ tg.cancel_scope.cancel()
83
+
84
+ tg.start_soon(_watch)
85
+ except* asyncio.CancelledError:
86
+ # This ensures the task group cleans up properly
87
+ logger.info("Task group cancelled, cleaning up...")
68
88
 
69
89
  anyio.run(_runner)
70
90
 
@@ -101,12 +121,29 @@ class MCPServer(FastMCP):
101
121
  yield {}
102
122
  finally:
103
123
  # Only call shutdown handler if SIGTERM was received
124
+ logger.info("Lifespan `finally` block reached. Checking for SIGTERM.")
125
+ # Force flush logs to ensure they're visible
126
+ sys.stderr.flush()
127
+
104
128
  if self._shutdown_fn is not None and _sigterm_received:
105
- logger.info("SIGTERM received, calling shutdown handler")
106
- await self._shutdown_fn()
129
+ logger.info("SIGTERM detected! Calling @mcp.shutdown handler...")
130
+ sys.stderr.flush()
131
+ try:
132
+ await self._shutdown_fn()
133
+ logger.info("@mcp.shutdown handler completed successfully.")
134
+ sys.stderr.flush()
135
+ except Exception as e:
136
+ logger.error("Error during @mcp.shutdown: %s", e)
137
+ sys.stderr.flush()
107
138
  _sigterm_received = False
108
139
  elif self._shutdown_fn is not None:
109
- logger.debug("Normal shutdown (hot reload), skipping shutdown handler")
140
+ logger.info(
141
+ "No SIGTERM. This is a hot reload (SIGINT) or normal exit. Skipping @mcp.shutdown handler." # noqa: E501
142
+ )
143
+ sys.stderr.flush()
144
+ else:
145
+ logger.info("No shutdown handler registered.")
146
+ sys.stderr.flush()
110
147
 
111
148
  fastmcp_kwargs["lifespan"] = _lifespan
112
149
 
hud/settings.py CHANGED
@@ -44,6 +44,18 @@ class Settings(BaseSettings):
44
44
  validation_alias="OPENAI_API_KEY",
45
45
  )
46
46
 
47
+ wandb_api_key: str | None = Field(
48
+ default=None,
49
+ description="API key for Weights & Biases",
50
+ validation_alias="WANDB_API_KEY",
51
+ )
52
+
53
+ prime_api_key: str | None = Field(
54
+ default=None,
55
+ description="API key for Prime Intellect",
56
+ validation_alias="PRIME_API_KEY",
57
+ )
58
+
47
59
  telemetry_enabled: bool = Field(
48
60
  default=True,
49
61
  description="Enable telemetry for the HUD SDK",
hud/types.py CHANGED
@@ -1,8 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import json
3
4
  import uuid
4
5
  from typing import Any, Literal
5
6
 
7
+ import mcp.types as types
6
8
  from mcp.types import CallToolRequestParams, CallToolResult
7
9
  from pydantic import BaseModel, ConfigDict, Field
8
10
 
@@ -13,22 +15,41 @@ class MCPToolCall(CallToolRequestParams):
13
15
  id: str = Field(default_factory=lambda: str(uuid.uuid4())) # Unique identifier for reference
14
16
 
15
17
  def __str__(self) -> str:
16
- response = f"Tool: {self.name}"
17
- if self.arguments:
18
- response += f"\nArguments: {self.arguments}"
19
- return response
18
+ """Format tool call with Rich markup for HUD design."""
19
+ from hud.utils.design import design
20
+
21
+ return design.format_tool_call(self.name, self.arguments)
20
22
 
21
23
 
22
24
  class MCPToolResult(CallToolResult):
23
25
  """A tool result."""
24
26
 
25
27
  def __str__(self) -> str:
26
- response = f"Content: {self.content}"
27
- if self.structuredContent:
28
- response += f"\nStructured Content: {self.structuredContent}"
29
- if self.isError:
30
- response += f"\nError: {self.isError}"
31
- return response
28
+ """Format tool result with Rich markup for HUD design - compact version."""
29
+ from hud.utils.design import design
30
+
31
+ # Extract content summary
32
+ content_summary = ""
33
+ if self.content:
34
+ for block in self.content:
35
+ if isinstance(block, types.TextContent):
36
+ # Get first line or truncate
37
+ text = block.text.strip()
38
+ first_line = text.split("\n")[0] if "\n" in text else text
39
+ content_summary = first_line
40
+ break
41
+ elif isinstance(block, types.ImageContent):
42
+ content_summary = "📷 Image"
43
+ break
44
+
45
+ # Or use structured content if no text content
46
+ if not content_summary and self.structuredContent:
47
+ try:
48
+ content_summary = json.dumps(self.structuredContent, separators=(",", ":"))
49
+ except (TypeError, ValueError):
50
+ content_summary = str(self.structuredContent)
51
+
52
+ return design.format_tool_result(content_summary, self.isError)
32
53
 
33
54
 
34
55
  class AgentResponse(BaseModel):