hud-python 0.2.10__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic.

Files changed (86)
  1. hud/__init__.py +20 -8
  2. hud/adapters/common/adapter.py +14 -3
  3. hud/adapters/common/tests/test_adapter.py +16 -4
  4. hud/datasets.py +188 -0
  5. hud/env/docker_client.py +15 -3
  6. hud/env/environment.py +10 -7
  7. hud/env/local_docker_client.py +29 -7
  8. hud/env/remote_client.py +1 -1
  9. hud/env/remote_docker_client.py +2 -2
  10. hud/exceptions.py +2 -1
  11. hud/gym.py +0 -9
  12. hud/mcp/__init__.py +17 -0
  13. hud/mcp/base.py +631 -0
  14. hud/mcp/claude.py +321 -0
  15. hud/mcp/client.py +312 -0
  16. hud/mcp/langchain.py +250 -0
  17. hud/mcp/openai.py +334 -0
  18. hud/mcp/tests/__init__.py +1 -0
  19. hud/mcp/tests/test_base.py +512 -0
  20. hud/mcp/tests/test_claude.py +294 -0
  21. hud/mcp/tests/test_client.py +324 -0
  22. hud/mcp/tests/test_openai.py +238 -0
  23. hud/settings.py +20 -2
  24. hud/task.py +5 -88
  25. hud/taskset.py +2 -23
  26. hud/telemetry/__init__.py +16 -7
  27. hud/telemetry/_trace.py +246 -72
  28. hud/telemetry/context.py +88 -27
  29. hud/telemetry/exporter.py +171 -11
  30. hud/telemetry/instrumentation/mcp.py +174 -410
  31. hud/telemetry/job.py +141 -0
  32. hud/telemetry/mcp_models.py +13 -74
  33. hud/telemetry/tests/test_context.py +9 -6
  34. hud/telemetry/tests/test_trace.py +120 -78
  35. hud/tools/__init__.py +34 -0
  36. hud/tools/base.py +65 -0
  37. hud/tools/bash.py +137 -0
  38. hud/tools/computer/__init__.py +13 -0
  39. hud/tools/computer/anthropic.py +411 -0
  40. hud/tools/computer/hud.py +315 -0
  41. hud/tools/computer/openai.py +283 -0
  42. hud/tools/edit.py +290 -0
  43. hud/tools/executors/__init__.py +30 -0
  44. hud/tools/executors/base.py +331 -0
  45. hud/tools/executors/pyautogui.py +619 -0
  46. hud/tools/executors/tests/__init__.py +1 -0
  47. hud/tools/executors/tests/test_base_executor.py +338 -0
  48. hud/tools/executors/tests/test_pyautogui_executor.py +165 -0
  49. hud/tools/executors/xdo.py +503 -0
  50. hud/tools/helper/README.md +56 -0
  51. hud/tools/helper/__init__.py +9 -0
  52. hud/tools/helper/mcp_server.py +78 -0
  53. hud/tools/helper/server_initialization.py +115 -0
  54. hud/tools/helper/utils.py +58 -0
  55. hud/tools/playwright_tool.py +379 -0
  56. hud/tools/tests/__init__.py +3 -0
  57. hud/tools/tests/test_bash.py +152 -0
  58. hud/tools/tests/test_computer.py +52 -0
  59. hud/tools/tests/test_computer_actions.py +34 -0
  60. hud/tools/tests/test_edit.py +240 -0
  61. hud/tools/tests/test_init.py +27 -0
  62. hud/tools/tests/test_playwright_tool.py +183 -0
  63. hud/tools/tests/test_tools.py +157 -0
  64. hud/tools/tests/test_utils.py +156 -0
  65. hud/tools/utils.py +50 -0
  66. hud/trajectory.py +5 -1
  67. hud/types.py +10 -1
  68. hud/utils/tests/test_init.py +21 -0
  69. hud/utils/tests/test_version.py +1 -1
  70. hud/version.py +1 -1
  71. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/METADATA +27 -18
  72. hud_python-0.3.1.dist-info/RECORD +119 -0
  73. hud/evaluators/__init__.py +0 -9
  74. hud/evaluators/base.py +0 -32
  75. hud/evaluators/inspect.py +0 -24
  76. hud/evaluators/judge.py +0 -189
  77. hud/evaluators/match.py +0 -156
  78. hud/evaluators/remote.py +0 -65
  79. hud/evaluators/tests/__init__.py +0 -0
  80. hud/evaluators/tests/test_inspect.py +0 -12
  81. hud/evaluators/tests/test_judge.py +0 -231
  82. hud/evaluators/tests/test_match.py +0 -115
  83. hud/evaluators/tests/test_remote.py +0 -98
  84. hud_python-0.2.10.dist-info/RECORD +0 -85
  85. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
  86. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
hud/telemetry/exporter.py CHANGED
@@ -2,17 +2,19 @@ from __future__ import annotations
 
 import asyncio
 import concurrent.futures  # For run_coroutine_threadsafe return type
+import enum
 import json
 import logging
 import threading
 import time
-from datetime import datetime, timezone  # For ISO timestamp conversion
+from datetime import UTC, datetime  # For ISO timestamp conversion
 from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     from collections.abc import Coroutine
 
 import httpx
+from pydantic import BaseModel
 
 from hud.settings import settings
 
@@ -25,6 +27,41 @@ from hud.telemetry.mcp_models import (  # MCPResponseCall for isinstance check
 
 logger = logging.getLogger("hud.telemetry")
 
+
+# --- Task Run Status Models ---
+class TaskRunStatus(enum.StrEnum):
+    INITIALIZING = "initializing"
+    RUNNING = "running"
+    EVALUATING = "evaluating"
+    COMPLETED = "completed"
+    ERROR = "error"
+
+
+class TaskRunStatusUpdateRequest(BaseModel):
+    """Request model for updating task run status."""
+
+    status: TaskRunStatus
+    error_message: str | None = None  # Optional error message if status is ERROR
+    metadata: dict[str, Any] | None = None  # Optional metadata for context
+    job_id: str | None = None  # Optional parent job ID
+
+
+# --- Job Status Models ---
+class JobStatus(enum.StrEnum):
+    RUNNING = "running"
+    COMPLETED = "completed"
+    ERROR = "error"
+
+
+class JobStatusUpdateRequest(BaseModel):
+    """Request model for updating job status."""
+
+    status: JobStatus
+    error_message: str | None = None  # Optional error message if status is ERROR
+    metadata: dict[str, Any] | None = None  # Optional metadata for context
+    taskset_name: str | None = None  # Optional dataset/taskset name
+
+
 # --- Worker Thread and Event Loop Management ---
 _worker_thread: threading.Thread | None = None
 _worker_loop: asyncio.AbstractEventLoop | None = None
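
Reviewer note, not part of the diff: the new request models lean on Pydantic's exclude_none behavior, so unset optional fields never reach the wire. A minimal sketch, assuming hud-python 0.3.1 is installed and importing the names added above:

    # Illustrative only; shows how the new status request serializes.
    from hud.telemetry.exporter import TaskRunStatus, TaskRunStatusUpdateRequest

    req = TaskRunStatusUpdateRequest(status=TaskRunStatus.ERROR, error_message="timeout")
    # exclude_none=True drops metadata and job_id, so the POST body carries only the set fields
    print(req.model_dump(exclude_none=True))
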
@@ -38,7 +75,8 @@ _export_lock_async = asyncio.Lock()  # Async lock for the async queue
 _export_task_async: asyncio.Task | None = None  # Async task for processing the queue
 
 # --- Constants ---
-EXPORT_INTERVAL = 5.0  # seconds
+EXPORT_INTERVAL = 5.0  # seconds - delay between non-incremental exports
+MIN_EXPORT_INTERVAL = 0.1  # seconds - minimum delay between any exports to avoid overwhelming
 # MAX_BATCH_SIZE removed as we send one trace payload at a time
 
 
@@ -157,7 +195,7 @@ async def export_telemetry(
     actual_start_time_float = getattr(mcp_call_model, "start_time", None)
     if actual_start_time_float:
         start_ts_iso = (
-            datetime.fromtimestamp(actual_start_time_float, timezone.utc)
+            datetime.fromtimestamp(actual_start_time_float, UTC)
             .isoformat()
             .replace("+00:00", "Z")
         )
@@ -170,7 +208,7 @@ async def export_telemetry(
 
     if effective_end_timestamp_float:
         end_ts_iso = (
-            datetime.fromtimestamp(effective_end_timestamp_float, timezone.utc)
+            datetime.fromtimestamp(effective_end_timestamp_float, UTC)
             .isoformat()
             .replace("+00:00", "Z")
         )
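
Reviewer note, not part of the diff: swapping timezone.utc for UTC (and enum.StrEnum above) means the module now requires Python 3.11 or newer; the formatting chain itself is unchanged. A standalone, standard-library-only sketch with an example epoch value:

    from datetime import UTC, datetime

    start_time = 1700000000.0  # example epoch seconds, as stored on the MCP call model
    start_ts_iso = datetime.fromtimestamp(start_time, UTC).isoformat().replace("+00:00", "Z")
    print(start_ts_iso)  # 2023-11-14T22:13:20Z
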
@@ -265,12 +303,19 @@ async def _process_export_queue_async() -> None:
 
             if isinstance(payload_to_process, dict):  # Ensure it's a dict before processing as such
                 await _export_trace_payload_async(payload_to_process)
+
+                # Apply appropriate delay based on export type
+                is_incremental = payload_to_process.get("attributes", {}).get("incremental", False)
+                if is_incremental:
+                    # Small delay for incremental exports to avoid overwhelming the server
+                    await asyncio.sleep(MIN_EXPORT_INTERVAL)
+                else:
+                    # Longer delay for final exports
+                    await asyncio.sleep(EXPORT_INTERVAL)
             else:
                 # Should not happen if only dicts and sentinel are queued
                 logger.warning("Unexpected item in telemetry queue: %s", type(payload_to_process))
 
-            await asyncio.sleep(EXPORT_INTERVAL)
-
     except asyncio.CancelledError:
         logger.debug("Async telemetry export processing task cancelled.")
         _export_task_async = None
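
Reviewer note, not part of the diff: the fixed 5-second sleep after every queue item is replaced by a per-payload delay read from the payload's own attributes. A small sketch of just that branch; the payload shape beyond the attributes.incremental flag is illustrative:

    import asyncio

    EXPORT_INTERVAL = 5.0
    MIN_EXPORT_INTERVAL = 0.1

    async def pace(payload: dict) -> None:
        # Incremental exports are throttled lightly; final exports keep the long delay.
        is_incremental = payload.get("attributes", {}).get("incremental", False)
        await asyncio.sleep(MIN_EXPORT_INTERVAL if is_incremental else EXPORT_INTERVAL)

    asyncio.run(pace({"attributes": {"incremental": True}}))  # sleeps ~0.1 s
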
@@ -340,6 +385,119 @@ async def send_telemetry_to_server(task_run_id: str, data: dict[str, Any]) -> None:
         logger.exception("Error exporting telemetry for task run %s: %s", task_run_id, e)
 
 
+async def update_task_run_status(
+    task_run_id: str,
+    status: TaskRunStatus,
+    error_message: str | None = None,
+    metadata: dict[str, Any] | None = None,
+    job_id: str | None = None,
+) -> None:
+    """Update the status of a task run."""
+    if not settings.telemetry_enabled:
+        logger.debug("Status update skipped - telemetry not enabled")
+        return
+
+    status_url = f"{settings.base_url}/v2/task_runs/{task_run_id}/status"
+
+    try:
+        async with httpx.AsyncClient() as client:
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {settings.api_key}",
+            }
+
+            request_data = TaskRunStatusUpdateRequest(
+                status=status, error_message=error_message, metadata=metadata, job_id=job_id
+            )
+
+            logger.debug(
+                "Updating status for task run %s to %s",
+                task_run_id,
+                status,
+            )
+
+            response = await client.post(
+                status_url,
+                json=request_data.model_dump(exclude_none=True),
+                headers=headers,
+                timeout=10.0,
+            )
+
+            if response.status_code >= 200 and response.status_code < 300:
+                logger.debug(
+                    "Successfully updated status for task run %s to %s",
+                    task_run_id,
+                    status,
+                )
+            else:
+                logger.warning(
+                    "Failed to update status for task run %s: HTTP %s - %s",
+                    task_run_id,
+                    response.status_code,
+                    response.text,
+                )
+    except Exception as e:
+        logger.exception("Error updating status for task run %s: %s", task_run_id, e)
+
+
+async def update_job_status(
+    job_id: str,
+    status: JobStatus,
+    error_message: str | None = None,
+    metadata: dict[str, Any] | None = None,
+    taskset_name: str | None = None,
+) -> None:
+    """Update the status of a job."""
+    if not settings.telemetry_enabled:
+        logger.debug("Job status update skipped - telemetry not enabled")
+        return
+
+    status_url = f"{settings.base_url}/v2/jobs/{job_id}/status"
+
+    try:
+        async with httpx.AsyncClient() as client:
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {settings.api_key}",
+            }
+
+            request_data = JobStatusUpdateRequest(
+                status=status,
+                error_message=error_message,
+                metadata=metadata,
+                taskset_name=taskset_name,
+            )
+
+            logger.debug(
+                "Updating status for job %s to %s",
+                job_id,
+                status,
+            )
+
+            response = await client.post(
+                status_url,
+                json=request_data.model_dump(exclude_none=True),
+                headers=headers,
+                timeout=10.0,
+            )
+
+            if response.status_code >= 200 and response.status_code < 300:
+                logger.debug(
+                    "Successfully updated status for job %s to %s",
+                    job_id,
+                    status,
+                )
+            else:
+                logger.warning(
+                    "Failed to update status for job %s: HTTP %s - %s",
+                    job_id,
+                    response.status_code,
+                    response.text,
+                )
+    except Exception as e:
+        logger.exception("Error updating status for job %s: %s", job_id, e)
+
+
 # --- Public Shutdown Function ---
 def flush(timeout: float = 10.0) -> None:
     """Flushes pending telemetry data and stops the worker thread."""
@@ -375,15 +533,17 @@ def flush(timeout: float = 10.0) -> None:
     # This check is racy, but it's the best we can do without more complex inter-thread
     # sync for task completion. Give some time for the task to process the sentinel and
     # clear itself.
-    # Max wait for task to clear
-    attempt_timeout = time.time() + (timeout / 2 if timeout else 2.0)
+    # Max wait for task to clear - should be longer than EXPORT_INTERVAL to ensure
+    # the task has time to wake from sleep and process the sentinel
+    attempt_timeout = time.time() + (timeout / 2 if timeout else 2.0) + EXPORT_INTERVAL + 1.0
     while _export_task_async is not None and time.time() < attempt_timeout:
         time.sleep(0.1)
     # _export_task_async is set to None by _process_export_queue_async upon its exit.
     if _export_task_async is not None:
-        logger.warning(
-            "Telemetry processing task did not clear itself after sentinel. May still be "
-            "running or stuck."
+        # This is often a false positive due to race conditions during shutdown
+        logger.debug(
+            "Telemetry processing task did not clear itself after sentinel. "
+            "This is normal during shutdown."
         )
     else:
         logger.debug("Telemetry processing task appears to have completed after sentinel.")