hud-python 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hud-python might be problematic.

Files changed (50)
  1. hud/__init__.py +22 -2
  2. hud/adapters/claude/adapter.py +9 -2
  3. hud/adapters/claude/tests/__init__.py +1 -0
  4. hud/adapters/claude/tests/test_adapter.py +519 -0
  5. hud/adapters/common/types.py +5 -1
  6. hud/adapters/operator/adapter.py +4 -0
  7. hud/adapters/operator/tests/__init__.py +1 -0
  8. hud/adapters/operator/tests/test_adapter.py +370 -0
  9. hud/agent/__init__.py +4 -0
  10. hud/agent/base.py +18 -2
  11. hud/agent/claude.py +20 -17
  12. hud/agent/claude_plays_pokemon.py +282 -0
  13. hud/agent/langchain.py +12 -7
  14. hud/agent/misc/__init__.py +3 -0
  15. hud/agent/misc/response_agent.py +80 -0
  16. hud/agent/operator.py +27 -19
  17. hud/agent/tests/__init__.py +1 -0
  18. hud/agent/tests/test_base.py +202 -0
  19. hud/env/docker_client.py +28 -18
  20. hud/env/environment.py +32 -16
  21. hud/env/local_docker_client.py +83 -42
  22. hud/env/remote_client.py +1 -3
  23. hud/env/remote_docker_client.py +72 -15
  24. hud/exceptions.py +12 -0
  25. hud/gym.py +71 -53
  26. hud/job.py +52 -7
  27. hud/settings.py +6 -0
  28. hud/task.py +45 -33
  29. hud/taskset.py +44 -4
  30. hud/telemetry/__init__.py +21 -0
  31. hud/telemetry/_trace.py +173 -0
  32. hud/telemetry/context.py +193 -0
  33. hud/telemetry/exporter.py +417 -0
  34. hud/telemetry/instrumentation/__init__.py +3 -0
  35. hud/telemetry/instrumentation/mcp.py +498 -0
  36. hud/telemetry/instrumentation/registry.py +59 -0
  37. hud/telemetry/mcp_models.py +331 -0
  38. hud/telemetry/tests/__init__.py +1 -0
  39. hud/telemetry/tests/test_context.py +203 -0
  40. hud/telemetry/tests/test_trace.py +270 -0
  41. hud/types.py +10 -26
  42. hud/utils/common.py +22 -2
  43. hud/utils/misc.py +53 -0
  44. hud/utils/tests/test_version.py +1 -1
  45. hud/version.py +7 -0
  46. {hud_python-0.2.4.dist-info → hud_python-0.2.5.dist-info}/METADATA +90 -22
  47. hud_python-0.2.5.dist-info/RECORD +84 -0
  48. hud_python-0.2.4.dist-info/RECORD +0 -62
  49. {hud_python-0.2.4.dist-info → hud_python-0.2.5.dist-info}/WHEEL +0 -0
  50. {hud_python-0.2.4.dist-info → hud_python-0.2.5.dist-info}/licenses/LICENSE +0 -0
hud/telemetry/exporter.py
@@ -0,0 +1,417 @@
+ from __future__ import annotations
+
+ import asyncio
+ import concurrent.futures  # For run_coroutine_threadsafe return type
+ import json
+ import logging
+ import threading
+ import time
+ from datetime import datetime, timezone  # For ISO timestamp conversion
+ from typing import TYPE_CHECKING, Any
+
+ if TYPE_CHECKING:
+     from collections.abc import Coroutine
+
+ import httpx
+
+ from hud.settings import settings
+
+ # Import BaseMCPCall and TrajectoryStep for type hinting and transformation
+ from hud.telemetry.mcp_models import (  # MCPResponseCall for isinstance check
+     BaseMCPCall,
+     MCPResponseCall,
+     TrajectoryStep,
+ )
+
+ logger = logging.getLogger("hud.telemetry")
+
+ # --- Worker Thread and Event Loop Management ---
+ _worker_thread: threading.Thread | None = None
+ _worker_loop: asyncio.AbstractEventLoop | None = None
+ _worker_lock = threading.Lock()  # For protecting worker thread/loop startup
+ _worker_loop_ready_event = threading.Event()  # Event for sync between threads
+
+ # --- Async Queue and Task (managed by the worker loop) ---
+ _SENTINEL_FOR_WORKER_SHUTDOWN = object()  # Sentinel for queue-based shutdown signaling
+ _export_queue_async: list[dict[str, Any] | object] = []  # Queue can hold dicts or sentinel
+ _export_lock_async = asyncio.Lock()  # Async lock for the async queue
+ _export_task_async: asyncio.Task | None = None  # Async task for processing the queue
+
+ # --- Constants ---
+ EXPORT_INTERVAL = 5.0  # seconds
+ # MAX_BATCH_SIZE removed as we send one trace payload at a time
+
+
+ def _run_worker_loop() -> None:
+     """Target function for the worker thread. Runs its own asyncio event loop."""
+     global _worker_loop
+     logger.debug("Telemetry worker thread: Starting event loop.")
+     _worker_loop = asyncio.new_event_loop()
+     asyncio.set_event_loop(_worker_loop)
+
+     _worker_loop_ready_event.set()  # Signal that loop is created and set for this thread
+
+     try:
+         logger.debug("Telemetry worker thread: Event loop running.")
+         _worker_loop.run_forever()
+     except Exception as e:
+         logger.exception("Telemetry worker loop encountered an unhandled exception: %s", e)
+     finally:
+         logger.debug("Telemetry worker loop: Starting cleanup...")
+         if _export_task_async and not _export_task_async.done():
+             logger.debug("Telemetry worker loop: Cancelling active export processing task.")
+             _export_task_async.cancel()
+             try:
+                 # Wait for the task to acknowledge cancellation
+                 _worker_loop.run_until_complete(
+                     asyncio.gather(_export_task_async, return_exceptions=True)
+                 )
+             except asyncio.CancelledError:
+                 logger.debug(
+                     "Telemetry worker loop: Export processing task acknowledged cancellation."
+                 )
+             except Exception as e_gather:
+                 logger.debug(
+                     "Telemetry worker loop: Exception during export task cleanup: %s", e_gather
+                 )
+
+         logger.debug("Telemetry worker loop: Closing.")
+         _worker_loop.close()
+         logger.debug("Telemetry worker thread: Event loop closed.")
+         # _worker_loop_ready_event.clear()  # Should be cleared by starter if thread is to be reused
+
+
+ def _start_worker_if_needed() -> None:
+     """Starts the background worker thread if not already running. Assumes _worker_lock is held."""
+     global _worker_thread  # _worker_loop is set by the thread itself
+     if _worker_thread is None or not _worker_thread.is_alive():
+         logger.debug("Telemetry: Worker thread not alive, starting new one.")
+         # _worker_loop should be None here or will be replaced by the new thread
+         _worker_loop_ready_event.clear()
+         _worker_thread = threading.Thread(
+             target=_run_worker_loop, daemon=True, name="HUDTelemetryWorker"
+         )
+         _worker_thread.start()
+
+         logger.debug("Telemetry: Waiting for worker thread event loop to be ready...")
+         if not _worker_loop_ready_event.wait(timeout=5.0):  # Wait up to 5 seconds
+             logger.error(
+                 "Telemetry: Worker thread failed to signal event loop readiness within timeout."
+             )
+             # This is a problem, subsequent submissions might fail.
+             return
+
+         # Minor delay to ensure loop might have started run_forever if wait was too tight
+         time.sleep(0.05)
+         if _worker_loop is None or not _worker_loop.is_running():
+             logger.error("Telemetry: Worker loop is not ready or not running after event was set.")
+         else:
+             logger.debug("Telemetry: Worker thread event loop is ready.")
+
+
+ def submit_to_worker_loop(coro: Coroutine[Any, Any, Any]) -> concurrent.futures.Future[Any] | None:
+     """Submits a coroutine to be run on the worker thread's event loop."""
+     with _worker_lock:  # Protects check-and-start of worker thread/loop
+         _start_worker_if_needed()
+
+     # Check _worker_loop status AFTER attempting to start and waiting for readiness event
+     if _worker_loop is None or not _worker_loop.is_running():
+         logger.error(
+             "Telemetry: Worker loop not available or not running for submitting coroutine."
+         )
+         return None
+
+     try:
+         future = asyncio.run_coroutine_threadsafe(coro, _worker_loop)
+         return future
+     except Exception as e:
+         # This can happen if the loop is shut down right as we try to submit
+         logger.exception("Telemetry: Failed to submit coroutine to worker loop: %s", e)
+         return None
+
+
+ # --- Telemetry Export Logic (runs on worker thread's loop) ---
+
+
+ async def export_telemetry(
+     task_run_id: str,
+     trace_attributes: dict[str, Any],
+     mcp_calls: list[BaseMCPCall],  # Type hint is now list[BaseMCPCall]
+ ) -> None:
+     """
+     Export telemetry data to the HUD telemetry service.
+
+     Args:
+         task_run_id: The task run ID associated with this telemetry
+         trace_attributes: Attributes of the trace
+         mcp_calls: List of MCP call Pydantic models to export
+     """
+     trajectory_steps_data: list[dict[str, Any]] = []
+     for mcp_call_model in mcp_calls:
+         action_data = mcp_call_model.model_dump()
+
+         start_ts_iso = None
+         end_ts_iso = None
+
+         # Get start_time if available (e.g. on MCPRequestCall, MCPNotificationCall)
+         actual_start_time_float = getattr(mcp_call_model, "start_time", None)
+         if actual_start_time_float:
+             start_ts_iso = (
+                 datetime.fromtimestamp(actual_start_time_float, timezone.utc)
+                 .isoformat()
+                 .replace("+00:00", "Z")
+             )
+
+         # Use 'end_time' if available, otherwise fall back to 'timestamp' for the end_timestamp
+         actual_end_time_float = getattr(mcp_call_model, "end_time", None)
+         effective_end_timestamp_float = (
+             actual_end_time_float if actual_end_time_float else mcp_call_model.timestamp
+         )
+
+         if effective_end_timestamp_float:
+             end_ts_iso = (
+                 datetime.fromtimestamp(effective_end_timestamp_float, timezone.utc)
+                 .isoformat()
+                 .replace("+00:00", "Z")
+             )
+
+         # For events that are more like points in time (e.g., a received response that
+         # doesn't have a separate start_time field) set start_timestamp to be the same as
+         # end_timestamp if start_timestamp wasn't explicitly set.
+         if end_ts_iso and not start_ts_iso:
+             start_ts_iso = end_ts_iso
+
+         step_metadata: dict[str, Any] = {
+             "mcp_method": mcp_call_model.method,
+             "mcp_status": mcp_call_model.status.value,
+             "mcp_call_type_original": mcp_call_model.call_type,
+         }
+         if mcp_call_model.direction:
+             step_metadata["mcp_direction"] = mcp_call_model.direction.value
+         if mcp_call_model.message_id is not None:
+             step_metadata["mcp_message_id"] = str(mcp_call_model.message_id)  # Ensure string
+
+         # Specific handling for MCPResponseCall fields in metadata
+         if isinstance(mcp_call_model, MCPResponseCall):
+             step_metadata["mcp_is_error"] = mcp_call_model.is_error  # bool is fine for JSON Any
+             if mcp_call_model.is_error:
+                 if mcp_call_model.error is not None:
+                     step_metadata["mcp_error_details"] = str(mcp_call_model.error)  # Ensure string
+                 if mcp_call_model.error_type is not None:
+                     step_metadata["mcp_error_type"] = str(
+                         mcp_call_model.error_type
+                     )  # Ensure string
+
+         obs_text = None
+         if isinstance(mcp_call_model, MCPResponseCall) and mcp_call_model.response_data:
+             result_data = mcp_call_model.response_data.get("result")
+             if result_data is not None:
+                 try:
+                     obs_text = json.dumps(result_data)
+                 except (TypeError, OverflowError):
+                     obs_text = str(result_data)
+
+         trajectory_step = TrajectoryStep(
+             type="mcp-step",
+             actions=[action_data],
+             start_timestamp=start_ts_iso,
+             end_timestamp=end_ts_iso,
+             metadata=step_metadata,
+             observation_text=obs_text,
+         )
+         trajectory_steps_data.append(trajectory_step.model_dump())
+
+     payload_to_queue = {
+         "task_run_id": task_run_id,
+         "attributes": trace_attributes,
+         "mcp_calls": trajectory_steps_data,
+         "timestamp": time.time(),
+     }
+
+     await _queue_for_export_async(payload_to_queue)
+
+
+ async def _queue_for_export_async(payload: dict[str, Any] | object) -> None:
+     """Adds a payload or sentinel to the async export queue. Runs on worker loop."""
+     global _export_task_async, _worker_loop
+     if not _worker_loop or not _worker_loop.is_running():
+         logger.error("Cannot queue telemetry, worker loop not running or not set.")
+         return
+
+     async with _export_lock_async:
+         _export_queue_async.append(payload)
+         if _export_task_async is None or _export_task_async.done():
+             _export_task_async = _worker_loop.create_task(_process_export_queue_async())
+             logger.debug("Started/Restarted async telemetry export processing task on worker loop.")
+
+
+ async def _process_export_queue_async() -> None:
+     """Processes the async export queue. Runs on worker loop via _export_task_async."""
+     global _export_task_async
+     try:
+         while True:
+             payload_to_process: dict[str, Any] | object | None = None
+             async with _export_lock_async:
+                 if not _export_queue_async:
+                     logger.debug("Async export queue empty, processing task will pause.")
+                     _export_task_async = None
+                     return
+                 payload_to_process = _export_queue_async.pop(0)
+
+             if payload_to_process is _SENTINEL_FOR_WORKER_SHUTDOWN:
+                 logger.debug("Shutdown sentinel received by processing task, stopping.")
+                 _export_task_async = None
+                 return
+
+             if isinstance(payload_to_process, dict):  # Ensure it's a dict before processing as such
+                 await _export_trace_payload_async(payload_to_process)
+             else:
+                 # Should not happen if only dicts and sentinel are queued
+                 logger.warning("Unexpected item in telemetry queue: %s", type(payload_to_process))
+
+             await asyncio.sleep(EXPORT_INTERVAL)
+
+     except asyncio.CancelledError:
+         logger.debug("Async telemetry export processing task cancelled.")
+         _export_task_async = None
+         raise
+     except Exception as e:
+         logger.exception("Error in async telemetry export processing task: %s", e)
+         _export_task_async = None
+
+
+ async def _export_trace_payload_async(payload: dict[str, Any]) -> None:
+     """Export a single trace payload to the HUD telemetry service."""
+     if not settings.telemetry_enabled:
+         logger.debug("Telemetry export skipped - telemetry not enabled")
+         return
+
+     task_run_id = payload.get("task_run_id")
+     if not task_run_id:
+         logger.warning("Payload missing task_run_id, skipping export")
+         return
+
+     # The payload itself is what we want to send (containing attributes and mcp_calls list)
+     # The mcp_calls within the payload are already dumped dictionaries.
+     data_to_send = {
+         "metadata": payload.get("attributes", {}),
+         "telemetry": payload.get("mcp_calls", []),
+     }
+
+     # Ensure mcp_calls is not empty if that's a requirement, or send as is. For now, send as is.
+     # if not data_to_send["mcp_calls"]:
+     #     logger.debug("No MCP calls in payload for task run %s, skipping specific export if "
+     #                  "desired.", task_run_id)
+     #     # Depending on backend, might not want to send empty mcp_calls list, or it's fine.
+
+     telemetry_url = f"{settings.base_url}/v2/task_runs/{task_run_id}/telemetry-upload"
+
+     try:
+         async with httpx.AsyncClient() as client:
+             headers = {
+                 "Content-Type": "application/json",
+                 "Authorization": f"Bearer {settings.api_key}",
+             }
+
+             logger.debug(
+                 "Exporting telemetry for task run %s to %s",
+                 task_run_id,
+                 telemetry_url,
+             )
+             response = await client.post(
+                 telemetry_url,
+                 json=data_to_send,  # Send the structured attributes and mcp_calls
+                 headers=headers,
+                 timeout=30.0,
+             )
+
+             if response.status_code >= 200 and response.status_code < 300:
+                 logger.debug(
+                     "Successfully exported telemetry for task run %s. Status: %s",
+                     task_run_id,
+                     response.status_code,
+                 )
+             else:
+                 logger.warning(
+                     "Failed to export telemetry for task run %s: HTTP %s - %s",
+                     task_run_id,
+                     response.status_code,
+                     response.text,
+                 )
+     except Exception as e:
+         logger.exception("Error exporting telemetry for task run %s: %s", task_run_id, e)
+
+
+ # --- Public Shutdown Function ---
+ def flush(timeout: float = 10.0) -> None:
+     """Flushes pending telemetry data and stops the worker thread."""
+     global _worker_thread, _worker_loop, _export_task_async, _export_queue_async
+     logger.debug("Initiating telemetry flush and shutdown.")
+
+     shutdown_future: concurrent.futures.Future | None = None
+     if _worker_loop and _worker_loop.is_running():
+         logger.debug("Submitting shutdown sentinel to telemetry worker's queue.")
+         coro = _queue_for_export_async(_SENTINEL_FOR_WORKER_SHUTDOWN)
+         try:
+             shutdown_future = asyncio.run_coroutine_threadsafe(coro, _worker_loop)
+         except Exception as e:  # Catch errors during submission (e.g. if loop is shutting down)
+             logger.warning("Exception during submission of shutdown sentinel: %s", e, exc_info=True)
+             # Proceed to attempt thread join if possible
+
+     if shutdown_future:
+         try:
+             shutdown_future.result(timeout / 2 if timeout else None)
+             logger.debug("Shutdown sentinel successfully queued.")
+         except concurrent.futures.TimeoutError:
+             logger.warning("Timeout waiting for shutdown sentinel to be queued.")
+         except Exception as e:
+             logger.warning(
+                 "Error waiting for shutdown sentinel to be queued: %s", e, exc_info=True
+             )
+
+     # Wait for the current _export_task_async to see the sentinel and finish.
+     # This is tricky because the task lives on another thread's loop.
+     # The best way is for _process_export_queue_async to clear _export_task_async when it exits.
+     # We then wait a bit for that to happen.
+     if _export_task_async is not None:  # Check if a task was even known to be running
+         # This check is racy, but it's the best we can do without more complex inter-thread
+         # sync for task completion. Give some time for the task to process the sentinel and
+         # clear itself.
+         # Max wait for task to clear
+         attempt_timeout = time.time() + (timeout / 2 if timeout else 2.0)
+         while _export_task_async is not None and time.time() < attempt_timeout:
+             time.sleep(0.1)
+         # _export_task_async is set to None by _process_export_queue_async upon its exit.
+         if _export_task_async is not None:
+             logger.warning(
+                 "Telemetry processing task did not clear itself after sentinel. May still be "
+                 "running or stuck."
+             )
+         else:
+             logger.debug("Telemetry processing task appears to have completed after sentinel.")
+
+     if _worker_loop and _worker_loop.is_running():
+         logger.debug("Requesting telemetry worker event loop to stop.")
+         # Ask the loop to stop running run_forever
+         _worker_loop.call_soon_threadsafe(_worker_loop.stop)
+
+     if _worker_thread and _worker_thread.is_alive():
+         logger.debug(
+             "Joining telemetry worker thread (up to remaining timeout)...",
+         )
+         # Calculate remaining timeout for join
+         remaining_timeout = timeout - (timeout / 2) if timeout else None  # Simplistic split
+         if remaining_timeout is not None and remaining_timeout < 0:
+             remaining_timeout = 0
+
+         _worker_thread.join(remaining_timeout)
+         if _worker_thread.is_alive():
+             logger.warning("Telemetry worker thread did not shut down cleanly after timeout.")
+         else:
+             logger.debug("Telemetry worker thread successfully joined.")
+
+     _worker_thread = None
+     _worker_loop = None
+     _export_task_async = None
+     # _export_queue_async.clear()  # Optionally clear the queue
+     logger.debug("Telemetry flush and shutdown process completed.")
hud/telemetry/instrumentation/__init__.py
@@ -0,0 +1,3 @@
+ """MCP instrumentation for telemetry collection."""
+
+ from __future__ import annotations