hud-python 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (50) hide show
  1. hud/__init__.py +22 -2
  2. hud/adapters/claude/adapter.py +9 -2
  3. hud/adapters/claude/tests/__init__.py +1 -0
  4. hud/adapters/claude/tests/test_adapter.py +519 -0
  5. hud/adapters/common/types.py +5 -1
  6. hud/adapters/operator/adapter.py +4 -0
  7. hud/adapters/operator/tests/__init__.py +1 -0
  8. hud/adapters/operator/tests/test_adapter.py +370 -0
  9. hud/agent/__init__.py +4 -0
  10. hud/agent/base.py +18 -2
  11. hud/agent/claude.py +20 -17
  12. hud/agent/claude_plays_pokemon.py +282 -0
  13. hud/agent/langchain.py +12 -7
  14. hud/agent/misc/__init__.py +3 -0
  15. hud/agent/misc/response_agent.py +80 -0
  16. hud/agent/operator.py +27 -19
  17. hud/agent/tests/__init__.py +1 -0
  18. hud/agent/tests/test_base.py +202 -0
  19. hud/env/docker_client.py +28 -18
  20. hud/env/environment.py +33 -17
  21. hud/env/local_docker_client.py +83 -42
  22. hud/env/remote_client.py +1 -3
  23. hud/env/remote_docker_client.py +72 -15
  24. hud/exceptions.py +12 -0
  25. hud/gym.py +71 -53
  26. hud/job.py +52 -7
  27. hud/settings.py +6 -0
  28. hud/task.py +45 -33
  29. hud/taskset.py +44 -4
  30. hud/telemetry/__init__.py +21 -0
  31. hud/telemetry/_trace.py +173 -0
  32. hud/telemetry/context.py +193 -0
  33. hud/telemetry/exporter.py +417 -0
  34. hud/telemetry/instrumentation/__init__.py +3 -0
  35. hud/telemetry/instrumentation/mcp.py +498 -0
  36. hud/telemetry/instrumentation/registry.py +59 -0
  37. hud/telemetry/mcp_models.py +331 -0
  38. hud/telemetry/tests/__init__.py +1 -0
  39. hud/telemetry/tests/test_context.py +203 -0
  40. hud/telemetry/tests/test_trace.py +270 -0
  41. hud/types.py +10 -26
  42. hud/utils/common.py +22 -2
  43. hud/utils/misc.py +53 -0
  44. hud/utils/tests/test_version.py +1 -1
  45. hud/version.py +7 -0
  46. {hud_python-0.2.3.dist-info → hud_python-0.2.5.dist-info}/METADATA +90 -22
  47. hud_python-0.2.5.dist-info/RECORD +84 -0
  48. hud_python-0.2.3.dist-info/RECORD +0 -62
  49. {hud_python-0.2.3.dist-info → hud_python-0.2.5.dist-info}/WHEEL +0 -0
  50. {hud_python-0.2.3.dist-info → hud_python-0.2.5.dist-info}/licenses/LICENSE +0 -0
hud/taskset.py CHANGED
@@ -5,15 +5,19 @@ from venv import logger
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
8
+ from hud.env.environment import create_remote_config
8
9
  from hud.server import make_request
9
10
  from hud.settings import settings
10
11
  from hud.task import Task
12
+ from hud.utils.config import REMOTE_EVALUATE, REMOTE_SETUP
11
13
 
12
14
  if TYPE_CHECKING:
13
15
  from collections.abc import Iterator
14
16
 
15
17
  from inspect_ai.dataset import Dataset
16
18
 
19
+ from hud.agent import Agent
20
+
17
21
 
18
22
  class TaskSet(BaseModel):
19
23
  """
@@ -21,11 +25,13 @@ class TaskSet(BaseModel):
21
25
 
22
26
  Attributes:
23
27
  id: Unique identifier for the taskset
28
+ name: Name of the taskset
24
29
  description: Description of the taskset
25
30
  tasks: List of Task objects in the taskset
26
31
  """
27
32
 
28
33
  id: str | None = None
34
+ name: str | None = None
29
35
  description: str | None = None
30
36
  tasks: list[Task] = []
31
37
 
@@ -61,16 +67,38 @@ class TaskSet(BaseModel):
61
67
 
62
68
  async def upload(
63
69
  self,
64
- name: str,
70
+ name: str | None = None,
65
71
  description: str | None = None,
66
72
  api_key: str | None = None,
67
73
  ) -> None:
68
74
  """
69
75
  Uploads the taskset to the server.
70
76
  """
77
+ if name is None:
78
+ name = self.name
79
+
80
+ if name is None:
81
+ raise ValueError("Taskset name is required")
82
+
71
83
  if api_key is None:
72
84
  api_key = settings.api_key
73
85
 
86
+ # Convert all tasks to expanded configs
87
+ processed_tasks = []
88
+ for task in self.tasks:
89
+ setup_config = create_remote_config(None, task.setup, REMOTE_SETUP)[0].args[0]
90
+ evaluate_config = create_remote_config(None, task.evaluate, REMOTE_EVALUATE)[0].args[0]
91
+
92
+ processed_tasks.append(
93
+ {
94
+ "prompt": task.prompt,
95
+ "gym": task.gym,
96
+ "setup": setup_config.model_dump(),
97
+ "evaluate": evaluate_config.model_dump(),
98
+ "config": task.config,
99
+ }
100
+ )
101
+
74
102
  await make_request(
75
103
  method="POST",
76
104
  url=f"{settings.base_url}/v2/tasksets",
@@ -78,13 +106,25 @@ class TaskSet(BaseModel):
78
106
  json={
79
107
  "name": name,
80
108
  "description": description,
81
- "tasks": [task.model_dump() for task in self.tasks],
109
+ "tasks": processed_tasks,
82
110
  },
83
111
  )
84
112
  logger.info(
85
- "[HUD] Taskset %s uploaded successfully, see it on app.hud.so/tasksets/%s", name, name
113
+ "Taskset %s uploaded successfully, see it on app.hud.so/evalsets/%s", name, name
86
114
  )
87
115
 
116
async def fit(self, agent: Agent | type[Agent]) -> None:
    """
    Remap each task's gym through the agent's ``transfer_gyms`` mapping.

    Args:
        agent: An agent instance, or an agent class. A class is instantiated
            with no arguments before its ``transfer_gyms`` is consulted.

    Tasks whose ``gym`` is ``None`` are left untouched; a gym with no entry in
    ``transfer_gyms`` keeps its current value.
    """
    resolved_agent = agent() if isinstance(agent, type) else agent

    for task in self.tasks:
        if task.gym is not None:
            # Fall back to the existing gym when the agent defines no transfer for it.
            task.gym = resolved_agent.transfer_gyms.get(task.gym, task.gym)
127
+
88
128
 
89
129
  async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
90
130
  """
@@ -107,7 +147,7 @@ async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
107
147
  api_key=api_key,
108
148
  )
109
149
 
110
- logger.info(f"[HUD] Taskset {taskset_id} loaded successfully")
150
+ logger.info(f"Taskset {taskset_id} loaded successfully")
111
151
 
112
152
  return TaskSet.model_validate(
113
153
  {
@@ -0,0 +1,21 @@
1
+ """
2
+ HUD telemetry module for capturing and reporting telemetry data from MCP calls.
3
+
4
+ This module provides functionality to trace MCP calls and export telemetry data
5
+ to the HUD platform for analysis.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from hud.telemetry._trace import init_telemetry, register_trace, trace
11
+ from hud.telemetry.context import get_current_task_run_id, set_current_task_run_id
12
+ from hud.telemetry.exporter import flush
13
+
14
# Names exported by the telemetry package; each one is imported above from
# the _trace, context, or exporter submodule.
__all__ = [
    "flush",
    "get_current_task_run_id",
    "init_telemetry",
    "register_trace",
    "set_current_task_run_id",
    "trace",
]
@@ -0,0 +1,173 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ import time
6
+ import uuid
7
+ from contextlib import contextmanager
8
+ from functools import wraps
9
+ from typing import (
10
+ TYPE_CHECKING,
11
+ Any,
12
+ ParamSpec,
13
+ TypeVar,
14
+ overload,
15
+ )
16
+
17
+ from hud.telemetry import exporter
18
+ from hud.telemetry.context import (
19
+ flush_buffer,
20
+ get_current_task_run_id,
21
+ is_root_trace,
22
+ set_current_task_run_id,
23
+ )
24
+ from hud.telemetry.exporter import submit_to_worker_loop
25
+ from hud.telemetry.instrumentation.registry import registry
26
+
27
+ if TYPE_CHECKING:
28
+ from collections.abc import (
29
+ Callable,
30
+ Coroutine,
31
+ Generator,
32
+ )
33
+
34
+ from hud.telemetry.mcp_models import BaseMCPCall
35
+
36
# Module logger, registered under the "hud.telemetry" name.
logger = logging.getLogger("hud.telemetry")
# Generic, unconstrained type variable.
T = TypeVar("T")
38
+
39
+
40
def init_telemetry() -> None:
    """Run ``registry.install_all()`` and log that telemetry is initialized.

    NOTE(review): this function does not itself start any export worker; if a
    worker must be running, that presumably happens elsewhere — confirm.
    """
    registry.install_all()
    logger.info("Telemetry initialized.")
44
+
45
+
46
@contextmanager
def trace(
    name: str | None = None,
    attributes: dict[str, Any] | None = None,
) -> Generator[str, None, None]:
    """
    Context manager for tracing a block of code.

    A fresh UUID string is generated as the task run ID for every trace and
    installed as the current task run ID for the duration of the block. On
    exit the MCP call buffer is drained (for nested traces as well), and, if
    this trace is a root trace that buffered at least one MCP call, the
    telemetry is submitted to the exporter's worker loop. The previous task
    run ID and root flag are then restored.

    Args:
        name: Optional name for this trace; stored in the trace attributes
            under the ``"trace_name"`` key.
        attributes: Optional dictionary of attributes to associate with this
            trace. It is copied, so the caller's dictionary is not modified.

    Yields:
        The generated task run ID (UUID string) used for this trace.
    """
    task_run_id = str(uuid.uuid4())

    # Work on a copy so the caller's dictionary is never mutated.
    local_attributes = attributes.copy() if attributes is not None else {}
    if name is not None:
        local_attributes["trace_name"] = name

    start_time = time.time()
    logger.debug("Starting trace %s (Name: %s)", task_run_id, name if name else "Unnamed")

    # Remember the enclosing context so it can be restored on exit.
    previous_task_id = get_current_task_run_id()
    was_root = is_root_trace.get()

    set_current_task_run_id(task_run_id)
    # A trace is a root trace when no other trace was active on entry.
    is_root = previous_task_id is None
    is_root_trace.set(is_root)

    try:
        yield task_run_id
    finally:
        end_time = time.time()
        duration = end_time - start_time

        mcp_calls: list[BaseMCPCall] = flush_buffer()

        trace_attributes_final = {
            **local_attributes,
            "start_time": start_time,
            "end_time": end_time,
            "duration": duration,
            "is_root": is_root,
        }

        # Only root traces with at least one buffered call are exported.
        if is_root and mcp_calls:
            try:
                coro_to_submit = exporter.export_telemetry(
                    task_run_id=task_run_id,
                    trace_attributes=trace_attributes_final,
                    mcp_calls=mcp_calls,
                )
                future = submit_to_worker_loop(coro_to_submit)
                if future:
                    logger.debug(
                        "Telemetry for trace %s submitted to background worker.", task_run_id
                    )
                else:
                    # The trailing space after "to" matters: adjacent literals
                    # are concatenated without any separator.
                    logger.warning(
                        "Failed to submit telemetry for trace %s to "
                        "background worker (loop not available).",
                        task_run_id,
                    )
            except Exception as e:
                # Best effort: a telemetry failure must not break the traced code.
                logger.warning("Failed to submit telemetry for trace %s: %s", task_run_id, e)

        set_current_task_run_id(previous_task_id)
        is_root_trace.set(was_root)

        logger.debug(
            "Ended trace %s (Name: %s) with %d MCP call(s)",
            task_run_id,
            name if name else "Unnamed",
            len(mcp_calls),
        )

        logger.info("View trace at https://app.hud.so/jobs/traces/%s", task_run_id)
127
+
128
+
129
# Parameter specification and return-type variable used in the signatures of
# the decorator built by register_trace.
P = ParamSpec("P")
R = TypeVar("R")
131
+
132
+
133
+ def register_trace(
134
+ name: str | None = None, attributes: dict[str, Any] | None = None
135
+ ) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
136
+ """
137
+ Decorator to wrap a synchronous or asynchronous function call
138
+ within a hud._telemetry.trace context.
139
+
140
+ Args:
141
+ name: Optional name for the trace.
142
+ attributes: Optional dictionary of attributes for the trace.
143
+ """
144
+
145
+ @overload
146
+ def decorator(
147
+ func: Callable[P, Coroutine[Any, Any, R]],
148
+ ) -> Callable[P, Coroutine[Any, Any, R]]: ...
149
+
150
+ @overload
151
+ def decorator(func: Callable[P, R]) -> Callable[P, R]: ...
152
+
153
+ def decorator(func: Callable[P, Any]) -> Callable[P, Any]:
154
+ if asyncio.iscoroutinefunction(func):
155
+
156
+ @wraps(func)
157
+ async def async_wrapper(*args: P.args, **kwargs: P.kwargs) -> Any:
158
+ effective_name = name if name else func.__name__
159
+ with trace(name=effective_name, attributes=attributes):
160
+ return await func(*args, **kwargs)
161
+
162
+ return async_wrapper
163
+ else:
164
+
165
+ @wraps(func)
166
+ def sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> Any:
167
+ effective_name = name if name else func.__name__
168
+ with trace(name=effective_name, attributes=attributes):
169
+ return func(*args, **kwargs)
170
+
171
+ return sync_wrapper
172
+
173
+ return decorator
@@ -0,0 +1,193 @@
1
+ from __future__ import annotations
2
+
3
+ import contextvars
4
+ import logging
5
+ from datetime import datetime
6
+ from typing import Any, TypeVar
7
+
8
+ from hud.telemetry.mcp_models import (
9
+ BaseMCPCall,
10
+ MCPManualTestCall,
11
+ MCPNotificationCall,
12
+ MCPRequestCall,
13
+ MCPResponseCall,
14
+ MCPTelemetryRecord,
15
+ StatusType,
16
+ )
17
+
18
+ logger = logging.getLogger("hud.telemetry")
19
+
20
# Context variables for tracing

# ID of the task run being traced in the current context. The declared default
# is None; set_current_task_run_id() stores "" in place of None, and
# get_current_task_run_id() maps "" back to None.
current_task_run_id: contextvars.ContextVar[str | None] = contextvars.ContextVar(
    "current_task_run_id", default=None
)
# MCP call records accumulated for the current context; None until a buffer
# list has been stored.
mcp_calls_buffer: contextvars.ContextVar[list[BaseMCPCall] | None] = contextvars.ContextVar(
    "mcp_calls_buffer", default=None
)
# Flag recording whether the trace active in the current context is a root
# trace; defaults to False.
is_root_trace: contextvars.ContextVar[bool] = contextvars.ContextVar("is_root_trace", default=False)

# Maximum buffer size before automatic flush
MAX_BUFFER_SIZE = 100

# Type variable for record factories
T = TypeVar("T", bound=BaseMCPCall)
34
+
35
+
36
def get_current_task_run_id() -> str | None:
    """Return the task_run_id bound to the current context, or None if unset.

    The empty string is stored as a stand-in for None and is translated back
    to None here.
    """
    stored = current_task_run_id.get()
    if stored == "":
        return None
    return stored
41
+
42
+
43
def set_current_task_run_id(task_run_id: str | None) -> None:
    """Bind task_run_id to the current context.

    None is stored as the empty string, which get_current_task_run_id()
    converts back to None.
    """
    if task_run_id is None:
        current_task_run_id.set("")
    else:
        current_task_run_id.set(task_run_id)
48
+
49
+
50
def buffer_mcp_call(record: BaseMCPCall | dict[str, Any]) -> None:
    """
    Append an MCP call to the buffer of the currently active trace.

    Nothing is buffered when no trace is active. A dictionary is converted
    with ``BaseMCPCall.from_dict``; a record whose ``task_run_id`` differs
    from the active one is rebuilt from its dumped fields with the active ID.

    Args:
        record: Either a Pydantic model instance or dictionary with MCP call data
    """
    active_id = get_current_task_run_id()
    if active_id is None or active_id == "":
        # No active trace: drop the record silently.
        return

    pending = mcp_calls_buffer.get()
    if pending is None:
        pending = []

    call = BaseMCPCall.from_dict(record) if isinstance(record, dict) else record

    if call.task_run_id != active_id:
        # Rebuild the record so it is attributed to the active trace.
        payload = call.model_dump()
        payload["task_run_id"] = active_id
        call = BaseMCPCall.from_dict(payload)

    pending.append(call)
    mcp_calls_buffer.set(pending)

    if len(pending) >= MAX_BUFFER_SIZE:
        logger.debug("MCP calls buffer reached size %d, auto-flushing", len(pending))
        # NOTE(review): the returned list is discarded here and
        # flush_buffer(export=True) only logs, so the flushed calls are no
        # longer in the buffer at end of trace — confirm this is intended.
        flush_buffer(export=True)
83
+
84
+
85
def flush_buffer(export: bool = False) -> list[BaseMCPCall]:
    """
    Empty the MCP calls buffer and hand back what it contained.

    The context variable is always reset to a new empty list. With
    ``export=True`` and a non-empty buffer, an ``MCPTelemetryRecord`` is
    constructed and the event is logged; no export call is made here.

    Args:
        export: Whether to trigger export of this buffer

    Returns:
        The list of buffered MCP calls (an empty list if nothing was buffered)
    """
    drained = mcp_calls_buffer.get()
    if drained is None:
        drained = []
    # Reset buffer to empty list
    mcp_calls_buffer.set([])

    if not (export and drained):
        return drained

    # The first record's ID is taken as the ID for the whole batch.
    run_id = drained[0].task_run_id
    if not run_id:
        logger.warning("No task_run_id found in buffer, skipping export")
        return drained

    logger.debug("Exporting %d MCP calls for task run %s", len(drained), run_id)
    # Create a telemetry record for export
    _telemetry_record = MCPTelemetryRecord(task_run_id=run_id, records=drained)
    # In the future, we could call an export function here
    # For now, just log that we have telemetry
    logger.debug("MCP telemetry record created with %d calls", len(drained))
    return drained
114
+
115
+
116
def create_request_record(
    method: str, status: StatusType = StatusType.STARTED, **kwargs: Any
) -> MCPRequestCall:
    """Create a request record for the active trace and add it to the buffer.

    Args:
        method: Method name recorded on the call.
        status: Status recorded on the call.
        **kwargs: Extra fields passed to ``MCPRequestCall``. A truthy
            ``start_time`` is used as given; otherwise the current timestamp
            is used.

    Returns:
        The newly created record.

    Raises:
        ValueError: If no task run is active in the current context.
    """
    active_id = get_current_task_run_id()
    if not active_id:
        logger.warning("No active task_run_id, request record will not be created")
        raise ValueError("No active task_run_id")

    # Pop start_time first so it is not passed twice through **kwargs.
    started_at = kwargs.pop("start_time", None) or datetime.now().timestamp()
    call = MCPRequestCall(
        task_run_id=active_id,
        method=method,
        status=status,
        start_time=started_at,
        **kwargs,
    )
    buffer_mcp_call(call)
    return call
134
+
135
+
136
def create_response_record(
    method: str, related_request_id: str | int | None = None, is_error: bool = False, **kwargs: Any
) -> MCPResponseCall:
    """Create a response record for the active trace and add it to the buffer.

    The record is always created with status ``StatusType.COMPLETED``.

    Args:
        method: Method name recorded on the call.
        related_request_id: Optional ID of the request this response belongs to.
        is_error: Whether the response is an error.
        **kwargs: Extra fields passed to ``MCPResponseCall``.

    Returns:
        The newly created record.

    Raises:
        ValueError: If no task run is active in the current context.
    """
    active_id = get_current_task_run_id()
    if not active_id:
        logger.warning("No active task_run_id, response record will not be created")
        raise ValueError("No active task_run_id")

    call = MCPResponseCall(
        task_run_id=active_id,
        method=method,
        status=StatusType.COMPLETED,
        related_request_id=related_request_id,
        is_error=is_error,
        **kwargs,
    )
    buffer_mcp_call(call)
    return call
155
+
156
+
157
def create_notification_record(
    method: str, status: StatusType = StatusType.STARTED, **kwargs: Any
) -> MCPNotificationCall:
    """Create a notification record for the active trace and add it to the buffer.

    Args:
        method: Method name recorded on the call.
        status: Status recorded on the call.
        **kwargs: Extra fields passed to ``MCPNotificationCall``. A truthy
            ``start_time`` is used as given; otherwise the current timestamp
            is used.

    Returns:
        The newly created record.

    Raises:
        ValueError: If no task run is active in the current context.
    """
    active_id = get_current_task_run_id()
    if not active_id:
        logger.warning("No active task_run_id, notification record will not be created")
        raise ValueError("No active task_run_id")

    # Pop start_time first so it is not passed twice through **kwargs.
    started_at = kwargs.pop("start_time", None) or datetime.now().timestamp()
    call = MCPNotificationCall(
        task_run_id=active_id,
        method=method,
        status=status,
        start_time=started_at,
        **kwargs,
    )
    buffer_mcp_call(call)
    return call
175
+
176
+
177
def create_manual_test_record(**custom_data: Any) -> MCPManualTestCall | None:
    """Create a manual test record for the active trace and add it to the buffer.

    Unlike the request/response/notification factories, this one does not
    raise when no trace is active; it logs a warning and returns None.

    Args:
        **custom_data: Fields passed to ``MCPManualTestCall.create``.

    Returns:
        The newly created record, or None when no task run is active.
    """
    active_id = get_current_task_run_id()
    if active_id:
        call = MCPManualTestCall.create(task_run_id=active_id, **custom_data)
        buffer_mcp_call(call)
        return call

    logger.warning("No active task_run_id, manual test record will not be created")
    return None
187
+
188
+
189
def reset_context() -> None:
    """Put every telemetry context variable back to its cleared state.

    Clears the task run ID, replaces the MCP calls buffer with a new empty
    list, and clears the root-trace flag. Useful for test isolation.
    """
    mcp_calls_buffer.set([])
    is_root_trace.set(False)
    set_current_task_run_id(None)