hud-python 0.2.10__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- hud/__init__.py +20 -8
- hud/adapters/common/adapter.py +14 -3
- hud/adapters/common/tests/test_adapter.py +16 -4
- hud/datasets.py +188 -0
- hud/env/docker_client.py +15 -3
- hud/env/environment.py +10 -7
- hud/env/local_docker_client.py +29 -7
- hud/env/remote_client.py +1 -1
- hud/env/remote_docker_client.py +2 -2
- hud/exceptions.py +2 -1
- hud/gym.py +0 -9
- hud/mcp/__init__.py +17 -0
- hud/mcp/base.py +631 -0
- hud/mcp/claude.py +321 -0
- hud/mcp/client.py +312 -0
- hud/mcp/langchain.py +250 -0
- hud/mcp/openai.py +334 -0
- hud/mcp/tests/__init__.py +1 -0
- hud/mcp/tests/test_base.py +512 -0
- hud/mcp/tests/test_claude.py +294 -0
- hud/mcp/tests/test_client.py +324 -0
- hud/mcp/tests/test_openai.py +238 -0
- hud/settings.py +20 -2
- hud/task.py +5 -88
- hud/taskset.py +2 -23
- hud/telemetry/__init__.py +16 -7
- hud/telemetry/_trace.py +246 -72
- hud/telemetry/context.py +88 -27
- hud/telemetry/exporter.py +171 -11
- hud/telemetry/instrumentation/mcp.py +174 -410
- hud/telemetry/job.py +141 -0
- hud/telemetry/mcp_models.py +13 -74
- hud/telemetry/tests/test_context.py +9 -6
- hud/telemetry/tests/test_trace.py +120 -78
- hud/tools/__init__.py +34 -0
- hud/tools/base.py +65 -0
- hud/tools/bash.py +137 -0
- hud/tools/computer/__init__.py +13 -0
- hud/tools/computer/anthropic.py +411 -0
- hud/tools/computer/hud.py +315 -0
- hud/tools/computer/openai.py +283 -0
- hud/tools/edit.py +290 -0
- hud/tools/executors/__init__.py +30 -0
- hud/tools/executors/base.py +331 -0
- hud/tools/executors/pyautogui.py +619 -0
- hud/tools/executors/tests/__init__.py +1 -0
- hud/tools/executors/tests/test_base_executor.py +338 -0
- hud/tools/executors/tests/test_pyautogui_executor.py +165 -0
- hud/tools/executors/xdo.py +503 -0
- hud/tools/helper/README.md +56 -0
- hud/tools/helper/__init__.py +9 -0
- hud/tools/helper/mcp_server.py +78 -0
- hud/tools/helper/server_initialization.py +115 -0
- hud/tools/helper/utils.py +58 -0
- hud/tools/playwright_tool.py +379 -0
- hud/tools/tests/__init__.py +3 -0
- hud/tools/tests/test_bash.py +152 -0
- hud/tools/tests/test_computer.py +52 -0
- hud/tools/tests/test_computer_actions.py +34 -0
- hud/tools/tests/test_edit.py +240 -0
- hud/tools/tests/test_init.py +27 -0
- hud/tools/tests/test_playwright_tool.py +183 -0
- hud/tools/tests/test_tools.py +157 -0
- hud/tools/tests/test_utils.py +156 -0
- hud/tools/utils.py +50 -0
- hud/trajectory.py +5 -1
- hud/types.py +10 -1
- hud/utils/tests/test_init.py +21 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/METADATA +27 -18
- hud_python-0.3.1.dist-info/RECORD +119 -0
- hud/evaluators/__init__.py +0 -9
- hud/evaluators/base.py +0 -32
- hud/evaluators/inspect.py +0 -24
- hud/evaluators/judge.py +0 -189
- hud/evaluators/match.py +0 -156
- hud/evaluators/remote.py +0 -65
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +0 -12
- hud/evaluators/tests/test_judge.py +0 -231
- hud/evaluators/tests/test_match.py +0 -115
- hud/evaluators/tests/test_remote.py +0 -98
- hud_python-0.2.10.dist-info/RECORD +0 -85
- {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
- {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
hud/telemetry/exporter.py
CHANGED
@@ -2,17 +2,19 @@ from __future__ import annotations
 
 import asyncio
 import concurrent.futures  # For run_coroutine_threadsafe return type
+import enum
 import json
 import logging
 import threading
 import time
-from datetime import
+from datetime import UTC, datetime  # For ISO timestamp conversion
 from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     from collections.abc import Coroutine
 
 import httpx
+from pydantic import BaseModel
 
 from hud.settings import settings
 
@@ -25,6 +27,41 @@ from hud.telemetry.mcp_models import (  # MCPResponseCall for isinstance check
 
 logger = logging.getLogger("hud.telemetry")
 
+
+# --- Task Run Status Models ---
+class TaskRunStatus(enum.StrEnum):
+    INITIALIZING = "initializing"
+    RUNNING = "running"
+    EVALUATING = "evaluating"
+    COMPLETED = "completed"
+    ERROR = "error"
+
+
+class TaskRunStatusUpdateRequest(BaseModel):
+    """Request model for updating task run status."""
+
+    status: TaskRunStatus
+    error_message: str | None = None  # Optional error message if status is ERROR
+    metadata: dict[str, Any] | None = None  # Optional metadata for context
+    job_id: str | None = None  # Optional parent job ID
+
+
+# --- Job Status Models ---
+class JobStatus(enum.StrEnum):
+    RUNNING = "running"
+    COMPLETED = "completed"
+    ERROR = "error"
+
+
+class JobStatusUpdateRequest(BaseModel):
+    """Request model for updating job status."""
+
+    status: JobStatus
+    error_message: str | None = None  # Optional error message if status is ERROR
+    metadata: dict[str, Any] | None = None  # Optional metadata for context
+    taskset_name: str | None = None  # Optional dataset/taskset name
+
+
 # --- Worker Thread and Event Loop Management ---
 _worker_thread: threading.Thread | None = None
 _worker_loop: asyncio.AbstractEventLoop | None = None
@@ -38,7 +75,8 @@ _export_lock_async = asyncio.Lock()  # Async lock for the async queue
 _export_task_async: asyncio.Task | None = None  # Async task for processing the queue
 
 # --- Constants ---
-EXPORT_INTERVAL = 5.0  # seconds
+EXPORT_INTERVAL = 5.0  # seconds - delay between non-incremental exports
+MIN_EXPORT_INTERVAL = 0.1  # seconds - minimum delay between any exports to avoid overwhelming
 # MAX_BATCH_SIZE removed as we send one trace payload at a time
 
 
@@ -157,7 +195,7 @@ async def export_telemetry(
         actual_start_time_float = getattr(mcp_call_model, "start_time", None)
         if actual_start_time_float:
             start_ts_iso = (
-                datetime.fromtimestamp(actual_start_time_float,
+                datetime.fromtimestamp(actual_start_time_float, UTC)
                 .isoformat()
                 .replace("+00:00", "Z")
             )
@@ -170,7 +208,7 @@ async def export_telemetry(
 
         if effective_end_timestamp_float:
             end_ts_iso = (
-                datetime.fromtimestamp(effective_end_timestamp_float,
+                datetime.fromtimestamp(effective_end_timestamp_float, UTC)
                 .isoformat()
                 .replace("+00:00", "Z")
             )
@@ -265,12 +303,19 @@ async def _process_export_queue_async() -> None:
 
             if isinstance(payload_to_process, dict):  # Ensure it's a dict before processing as such
                 await _export_trace_payload_async(payload_to_process)
+
+                # Apply appropriate delay based on export type
+                is_incremental = payload_to_process.get("attributes", {}).get("incremental", False)
+                if is_incremental:
+                    # Small delay for incremental exports to avoid overwhelming the server
+                    await asyncio.sleep(MIN_EXPORT_INTERVAL)
+                else:
+                    # Longer delay for final exports
+                    await asyncio.sleep(EXPORT_INTERVAL)
             else:
                 # Should not happen if only dicts and sentinel are queued
                 logger.warning("Unexpected item in telemetry queue: %s", type(payload_to_process))
 
-            await asyncio.sleep(EXPORT_INTERVAL)
-
     except asyncio.CancelledError:
         logger.debug("Async telemetry export processing task cancelled.")
         _export_task_async = None
@@ -340,6 +385,119 @@ async def send_telemetry_to_server(task_run_id: str, data: dict[str, Any]) -> None:
         logger.exception("Error exporting telemetry for task run %s: %s", task_run_id, e)
 
 
+async def update_task_run_status(
+    task_run_id: str,
+    status: TaskRunStatus,
+    error_message: str | None = None,
+    metadata: dict[str, Any] | None = None,
+    job_id: str | None = None,
+) -> None:
+    """Update the status of a task run."""
+    if not settings.telemetry_enabled:
+        logger.debug("Status update skipped - telemetry not enabled")
+        return
+
+    status_url = f"{settings.base_url}/v2/task_runs/{task_run_id}/status"
+
+    try:
+        async with httpx.AsyncClient() as client:
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {settings.api_key}",
+            }
+
+            request_data = TaskRunStatusUpdateRequest(
+                status=status, error_message=error_message, metadata=metadata, job_id=job_id
+            )
+
+            logger.debug(
+                "Updating status for task run %s to %s",
+                task_run_id,
+                status,
+            )
+
+            response = await client.post(
+                status_url,
+                json=request_data.model_dump(exclude_none=True),
+                headers=headers,
+                timeout=10.0,
+            )
+
+            if response.status_code >= 200 and response.status_code < 300:
+                logger.debug(
+                    "Successfully updated status for task run %s to %s",
+                    task_run_id,
+                    status,
+                )
+            else:
+                logger.warning(
+                    "Failed to update status for task run %s: HTTP %s - %s",
+                    task_run_id,
+                    response.status_code,
+                    response.text,
+                )
+    except Exception as e:
+        logger.exception("Error updating status for task run %s: %s", task_run_id, e)
+
+
+async def update_job_status(
+    job_id: str,
+    status: JobStatus,
+    error_message: str | None = None,
+    metadata: dict[str, Any] | None = None,
+    taskset_name: str | None = None,
+) -> None:
+    """Update the status of a job."""
+    if not settings.telemetry_enabled:
+        logger.debug("Job status update skipped - telemetry not enabled")
+        return
+
+    status_url = f"{settings.base_url}/v2/jobs/{job_id}/status"
+
+    try:
+        async with httpx.AsyncClient() as client:
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {settings.api_key}",
+            }
+
+            request_data = JobStatusUpdateRequest(
+                status=status,
+                error_message=error_message,
+                metadata=metadata,
+                taskset_name=taskset_name,
+            )
+
+            logger.debug(
+                "Updating status for job %s to %s",
+                job_id,
+                status,
+            )
+
+            response = await client.post(
+                status_url,
+                json=request_data.model_dump(exclude_none=True),
+                headers=headers,
+                timeout=10.0,
+            )
+
+            if response.status_code >= 200 and response.status_code < 300:
+                logger.debug(
+                    "Successfully updated status for job %s to %s",
+                    job_id,
+                    status,
+                )
+            else:
+                logger.warning(
+                    "Failed to update status for job %s: HTTP %s - %s",
+                    job_id,
+                    response.status_code,
+                    response.text,
+                )
+    except Exception as e:
+        logger.exception("Error updating status for job %s: %s", job_id, e)
+
+
 # --- Public Shutdown Function ---
 def flush(timeout: float = 10.0) -> None:
     """Flushes pending telemetry data and stops the worker thread."""
@@ -375,15 +533,17 @@ def flush(timeout: float = 10.0) -> None:
     # This check is racy, but it's the best we can do without more complex inter-thread
     # sync for task completion. Give some time for the task to process the sentinel and
     # clear itself.
-    # Max wait for task to clear
-
+    # Max wait for task to clear - should be longer than EXPORT_INTERVAL to ensure
+    # the task has time to wake from sleep and process the sentinel
+    attempt_timeout = time.time() + (timeout / 2 if timeout else 2.0) + EXPORT_INTERVAL + 1.0
     while _export_task_async is not None and time.time() < attempt_timeout:
         time.sleep(0.1)
     # _export_task_async is set to None by _process_export_queue_async upon its exit.
     if _export_task_async is not None:
-
-
-        "
+        # This is often a false positive due to race conditions during shutdown
+        logger.debug(
+            "Telemetry processing task did not clear itself after sentinel. "
+            "This is normal during shutdown."
         )
     else:
         logger.debug("Telemetry processing task appears to have completed after sentinel.")