hud-python 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/agent/claude_plays_pokemon.py +2 -1
- hud/env/remote_docker_client.py +2 -2
- hud/job.py +9 -9
- hud/server/requests.py +26 -4
- hud/settings.py +1 -1
- hud/taskset.py +16 -4
- hud/telemetry/context.py +33 -57
- hud/telemetry/instrumentation/mcp.py +0 -3
- hud/telemetry/tests/test_context.py +7 -3
- hud/types.py +1 -1
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.2.5.dist-info → hud_python-0.2.6.dist-info}/METADATA +18 -18
- {hud_python-0.2.5.dist-info → hud_python-0.2.6.dist-info}/RECORD +16 -16
- {hud_python-0.2.5.dist-info → hud_python-0.2.6.dist-info}/WHEEL +0 -0
- {hud_python-0.2.5.dist-info → hud_python-0.2.6.dist-info}/licenses/LICENSE +0 -0
|
@@ -11,6 +11,7 @@ from anthropic.types.beta import (
|
|
|
11
11
|
BetaImageBlockParam,
|
|
12
12
|
)
|
|
13
13
|
|
|
14
|
+
from hud.adapters.common.types import CLA
|
|
14
15
|
from hud.agent import Agent
|
|
15
16
|
from hud.adapters import Adapter
|
|
16
17
|
from hud.settings import settings
|
|
@@ -128,7 +129,7 @@ def extract_json_from_response(response: str) -> str:
|
|
|
128
129
|
return response.strip()
|
|
129
130
|
|
|
130
131
|
|
|
131
|
-
class ClaudePlaysPokemon(Agent[AsyncAnthropic,
|
|
132
|
+
class ClaudePlaysPokemon(Agent[AsyncAnthropic, CLA]):
|
|
132
133
|
"""AI agent that plays Pokémon games using Claude."""
|
|
133
134
|
|
|
134
135
|
def __init__(
|
hud/env/remote_docker_client.py
CHANGED
|
@@ -113,8 +113,8 @@ class RemoteDockerClient(DockerClient):
|
|
|
113
113
|
|
|
114
114
|
logger.info("Creating remote environment")
|
|
115
115
|
|
|
116
|
-
true_gym_id = await get_gym_id("local-docker")
|
|
117
|
-
|
|
116
|
+
# true_gym_id = await get_gym_id("local-docker")
|
|
117
|
+
true_gym_id = await get_gym_id("docker")
|
|
118
118
|
|
|
119
119
|
# augment metadata with dockerfile
|
|
120
120
|
if "environment_config" not in metadata:
|
hud/job.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
|
-
import datetime
|
|
5
4
|
import functools
|
|
6
5
|
import inspect
|
|
7
6
|
import logging
|
|
8
7
|
import sys
|
|
9
8
|
from collections.abc import Callable, Coroutine
|
|
9
|
+
from datetime import datetime
|
|
10
10
|
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
|
11
11
|
|
|
12
12
|
from pydantic import BaseModel, PrivateAttr, TypeAdapter
|
|
@@ -44,7 +44,7 @@ class Job(BaseModel):
|
|
|
44
44
|
id: str
|
|
45
45
|
name: str
|
|
46
46
|
metadata: dict[str, Any] | None = None
|
|
47
|
-
created_at: datetime
|
|
47
|
+
created_at: datetime
|
|
48
48
|
status: str
|
|
49
49
|
|
|
50
50
|
# Internal cache for trajectories
|
|
@@ -164,13 +164,15 @@ async def create_job(
|
|
|
164
164
|
# If not, we might need to make a subsequent GET request
|
|
165
165
|
job_data = data # Adjust if the API response structure is different
|
|
166
166
|
|
|
167
|
+
created_at = datetime.fromisoformat(job_data["created_at"].replace("Z", "+00:00"))
|
|
168
|
+
|
|
167
169
|
logger.info("View job at https://app.hud.so/jobs/%s.", job_data["id"])
|
|
168
170
|
|
|
169
171
|
return Job(
|
|
170
172
|
id=job_data["id"],
|
|
171
173
|
name=job_data["name"],
|
|
172
174
|
metadata=job_data.get("metadata", {}), # Ensure metadata is dict
|
|
173
|
-
created_at=
|
|
175
|
+
created_at=created_at, # Parse datetime
|
|
174
176
|
status=job_data["status"],
|
|
175
177
|
)
|
|
176
178
|
|
|
@@ -379,7 +381,7 @@ async def _execute_task(
|
|
|
379
381
|
"type": "step_error",
|
|
380
382
|
"step": step + 1,
|
|
381
383
|
"error": str(agent_step_err),
|
|
382
|
-
"timestamp": datetime.
|
|
384
|
+
"timestamp": datetime.now().isoformat(),
|
|
383
385
|
}
|
|
384
386
|
)
|
|
385
387
|
continue
|
|
@@ -413,7 +415,7 @@ async def _execute_task(
|
|
|
413
415
|
"task_id": task_id,
|
|
414
416
|
"type": "evaluation_error",
|
|
415
417
|
"error": str(eval_err),
|
|
416
|
-
"timestamp": datetime.
|
|
418
|
+
"timestamp": datetime.now().isoformat(),
|
|
417
419
|
}
|
|
418
420
|
)
|
|
419
421
|
|
|
@@ -427,7 +429,7 @@ async def _execute_task(
|
|
|
427
429
|
"task_id": task_id,
|
|
428
430
|
"type": "setup_error",
|
|
429
431
|
"error": str(e),
|
|
430
|
-
"timestamp": datetime.
|
|
432
|
+
"timestamp": datetime.now().isoformat(),
|
|
431
433
|
}
|
|
432
434
|
)
|
|
433
435
|
|
|
@@ -447,7 +449,7 @@ async def _execute_task(
|
|
|
447
449
|
"task_id": task_id,
|
|
448
450
|
"type": "env_close_error",
|
|
449
451
|
"error": str(close_err),
|
|
450
|
-
"timestamp": datetime.
|
|
452
|
+
"timestamp": datetime.now().isoformat(),
|
|
451
453
|
}
|
|
452
454
|
)
|
|
453
455
|
|
|
@@ -532,8 +534,6 @@ async def run_job(
|
|
|
532
534
|
Returns:
|
|
533
535
|
The created Job object with errors stored in job.errors.
|
|
534
536
|
"""
|
|
535
|
-
hud_logger = logging.getLogger("hud")
|
|
536
|
-
hud_logger.setLevel(logging.CRITICAL)
|
|
537
537
|
|
|
538
538
|
tasks_to_run: list[Task] = []
|
|
539
539
|
created_job: Job | None = None
|
hud/server/requests.py
CHANGED
|
@@ -6,6 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
import asyncio
|
|
8
8
|
import logging
|
|
9
|
+
import ssl
|
|
9
10
|
import time
|
|
10
11
|
from typing import Any
|
|
11
12
|
|
|
@@ -20,7 +21,7 @@ from hud.exceptions import (
|
|
|
20
21
|
|
|
21
22
|
# Set up logger
|
|
22
23
|
logger = logging.getLogger("hud.http")
|
|
23
|
-
logger.setLevel(logging.
|
|
24
|
+
logger.setLevel(logging.INFO)
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
# Long running requests can take up to 10 minutes.
|
|
@@ -37,7 +38,7 @@ async def _handle_retry(
|
|
|
37
38
|
) -> None:
|
|
38
39
|
"""Helper function to handle retry logic and logging."""
|
|
39
40
|
retry_time = retry_delay * (2 ** (attempt - 1)) # Exponential backoff
|
|
40
|
-
logger.
|
|
41
|
+
logger.debug(
|
|
41
42
|
"%s from %s, retrying in %.2f seconds (attempt %d/%d)",
|
|
42
43
|
error_msg,
|
|
43
44
|
url,
|
|
@@ -140,6 +141,12 @@ async def make_request(
|
|
|
140
141
|
continue
|
|
141
142
|
else:
|
|
142
143
|
raise HudNetworkError(f"Network error: {e!s}") from None
|
|
144
|
+
except ssl.SSLError as e:
|
|
145
|
+
if attempt <= max_retries:
|
|
146
|
+
await _handle_retry(attempt, max_retries, retry_delay, url, f"SSL error: {e}")
|
|
147
|
+
continue
|
|
148
|
+
else:
|
|
149
|
+
raise HudNetworkError(f"SSL error: {e!s}") from None
|
|
143
150
|
except Exception as e:
|
|
144
151
|
raise HudRequestError(f"Unexpected error: {e!s}") from None
|
|
145
152
|
raise HudRequestError(f"Request failed after {max_retries} retries with unknown error")
|
|
@@ -201,7 +208,7 @@ def make_request_sync(
|
|
|
201
208
|
# Check if we got a retriable status code
|
|
202
209
|
if response.status_code in retry_status_codes and attempt <= max_retries:
|
|
203
210
|
retry_time = retry_delay * (2 ** (attempt - 1)) # Exponential backoff
|
|
204
|
-
logger.
|
|
211
|
+
logger.debug(
|
|
205
212
|
"Received status %d from %s, retrying in %.2f seconds (attempt %d/%d)",
|
|
206
213
|
response.status_code,
|
|
207
214
|
url,
|
|
@@ -222,7 +229,7 @@ def make_request_sync(
|
|
|
222
229
|
except httpx.RequestError as e:
|
|
223
230
|
if attempt <= max_retries:
|
|
224
231
|
retry_time = retry_delay * (2 ** (attempt - 1))
|
|
225
|
-
logger.
|
|
232
|
+
logger.debug(
|
|
226
233
|
"Network error %s from %s, retrying in %.2f seconds (attempt %d/%d)",
|
|
227
234
|
str(e),
|
|
228
235
|
url,
|
|
@@ -234,6 +241,21 @@ def make_request_sync(
|
|
|
234
241
|
continue
|
|
235
242
|
else:
|
|
236
243
|
raise HudNetworkError(f"Network error: {e!s}") from None
|
|
244
|
+
except ssl.SSLError as e:
|
|
245
|
+
if attempt <= max_retries:
|
|
246
|
+
retry_time = retry_delay * (2 ** (attempt - 1)) # Exponential backoff
|
|
247
|
+
logger.debug(
|
|
248
|
+
"SSL error %s from %s, retrying in %.2f seconds (attempt %d/%d)",
|
|
249
|
+
str(e),
|
|
250
|
+
url,
|
|
251
|
+
retry_time,
|
|
252
|
+
attempt,
|
|
253
|
+
max_retries,
|
|
254
|
+
)
|
|
255
|
+
time.sleep(retry_time)
|
|
256
|
+
continue
|
|
257
|
+
else:
|
|
258
|
+
raise HudNetworkError(f"SSL error: {e!s}") from None
|
|
237
259
|
except Exception as e:
|
|
238
260
|
raise HudRequestError(f"Unexpected error: {e!s}") from None
|
|
239
261
|
raise HudRequestError(f"Request failed after {max_retries} retries with unknown error")
|
hud/settings.py
CHANGED
|
@@ -15,7 +15,7 @@ class Settings(BaseSettings):
|
|
|
15
15
|
model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="allow")
|
|
16
16
|
|
|
17
17
|
base_url: str = Field(
|
|
18
|
-
default="https://
|
|
18
|
+
default="https://orchestration.hud.so/hud-gym/api",
|
|
19
19
|
description="Base URL for the HUD API",
|
|
20
20
|
validation_alias="base_url",
|
|
21
21
|
)
|
hud/taskset.py
CHANGED
|
@@ -86,15 +86,27 @@ class TaskSet(BaseModel):
|
|
|
86
86
|
# Convert all tasks to expanded configs
|
|
87
87
|
processed_tasks = []
|
|
88
88
|
for task in self.tasks:
|
|
89
|
-
|
|
90
|
-
|
|
89
|
+
if task.setup is not None:
|
|
90
|
+
setup_config = (
|
|
91
|
+
create_remote_config(None, task.setup, REMOTE_SETUP)[0].args[0].model_dump()
|
|
92
|
+
)
|
|
93
|
+
else:
|
|
94
|
+
setup_config = None
|
|
95
|
+
if task.evaluate is not None:
|
|
96
|
+
evaluate_config = (
|
|
97
|
+
create_remote_config(None, task.evaluate, REMOTE_EVALUATE)[0]
|
|
98
|
+
.args[0]
|
|
99
|
+
.model_dump()
|
|
100
|
+
)
|
|
101
|
+
else:
|
|
102
|
+
evaluate_config = None
|
|
91
103
|
|
|
92
104
|
processed_tasks.append(
|
|
93
105
|
{
|
|
94
106
|
"prompt": task.prompt,
|
|
95
107
|
"gym": task.gym,
|
|
96
|
-
"setup": setup_config
|
|
97
|
-
"evaluate": evaluate_config
|
|
108
|
+
"setup": setup_config,
|
|
109
|
+
"evaluate": evaluate_config,
|
|
98
110
|
"config": task.config,
|
|
99
111
|
}
|
|
100
112
|
)
|
hud/telemetry/context.py
CHANGED
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import contextvars
|
|
4
4
|
import logging
|
|
5
|
+
from collections import defaultdict
|
|
5
6
|
from datetime import datetime
|
|
6
7
|
from typing import Any, TypeVar
|
|
7
8
|
|
|
@@ -11,7 +12,6 @@ from hud.telemetry.mcp_models import (
|
|
|
11
12
|
MCPNotificationCall,
|
|
12
13
|
MCPRequestCall,
|
|
13
14
|
MCPResponseCall,
|
|
14
|
-
MCPTelemetryRecord,
|
|
15
15
|
StatusType,
|
|
16
16
|
)
|
|
17
17
|
|
|
@@ -21,9 +21,8 @@ logger = logging.getLogger("hud.telemetry")
|
|
|
21
21
|
current_task_run_id: contextvars.ContextVar[str | None] = contextvars.ContextVar(
|
|
22
22
|
"current_task_run_id", default=None
|
|
23
23
|
)
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
)
|
|
24
|
+
# NEW: Global dictionary for buffering, keyed by task_run_id
|
|
25
|
+
_GLOBAL_MCP_CALL_BUFFERS: defaultdict[str, list[BaseMCPCall]] = defaultdict(list)
|
|
27
26
|
is_root_trace: contextvars.ContextVar[bool] = contextvars.ContextVar("is_root_trace", default=False)
|
|
28
27
|
|
|
29
28
|
# Maximum buffer size before automatic flush
|
|
@@ -35,51 +34,37 @@ T = TypeVar("T", bound=BaseMCPCall)
|
|
|
35
34
|
|
|
36
35
|
def get_current_task_run_id() -> str | None:
|
|
37
36
|
"""Get the task_run_id for the current trace context."""
|
|
38
|
-
|
|
39
|
-
# Convert empty string sentinel back to None
|
|
40
|
-
return None if value == "" else value
|
|
37
|
+
return current_task_run_id.get()
|
|
41
38
|
|
|
42
39
|
|
|
43
40
|
def set_current_task_run_id(task_run_id: str | None) -> None:
|
|
44
41
|
"""Set the task_run_id for the current trace context."""
|
|
45
|
-
|
|
46
|
-
value_to_set = "" if task_run_id is None else task_run_id
|
|
47
|
-
current_task_run_id.set(value_to_set)
|
|
42
|
+
current_task_run_id.set(task_run_id)
|
|
48
43
|
|
|
49
44
|
|
|
50
45
|
def buffer_mcp_call(record: BaseMCPCall | dict[str, Any]) -> None:
|
|
51
|
-
"""
|
|
52
|
-
Add an MCP call to the buffer for the current trace.
|
|
53
|
-
|
|
54
|
-
Args:
|
|
55
|
-
record: Either a Pydantic model instance or dictionary with MCP call data
|
|
56
|
-
"""
|
|
57
|
-
# Only buffer if we have an active trace
|
|
58
46
|
task_run_id = get_current_task_run_id()
|
|
59
|
-
if task_run_id is not None and task_run_id != "":
|
|
60
|
-
buffer = mcp_calls_buffer.get()
|
|
61
|
-
if buffer is None:
|
|
62
|
-
buffer = []
|
|
63
47
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
48
|
+
if not task_run_id:
|
|
49
|
+
logger.warning(
|
|
50
|
+
"BUFFER_MCP_CALL: No task_run_id. Skipping buffer for %s", type(record).__name__
|
|
51
|
+
)
|
|
52
|
+
return
|
|
67
53
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
54
|
+
# Ensure 'record' is a Pydantic model instance from here
|
|
55
|
+
if isinstance(record, dict):
|
|
56
|
+
try:
|
|
57
|
+
record_model = BaseMCPCall.from_dict(record)
|
|
58
|
+
record = record_model
|
|
59
|
+
except Exception as e_conv:
|
|
60
|
+
logger.exception("BUFFER_MCP_CALL: Failed to convert dict to BaseMCPCall: %s", e_conv)
|
|
61
|
+
return
|
|
74
62
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
mcp_calls_buffer.set(buffer)
|
|
63
|
+
_GLOBAL_MCP_CALL_BUFFERS[task_run_id].append(record)
|
|
64
|
+
buffer_len = len(_GLOBAL_MCP_CALL_BUFFERS[task_run_id])
|
|
78
65
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
logger.debug("MCP calls buffer reached size %d, auto-flushing", len(buffer))
|
|
82
|
-
flush_buffer(export=True)
|
|
66
|
+
if buffer_len >= MAX_BUFFER_SIZE:
|
|
67
|
+
flush_buffer(export=True)
|
|
83
68
|
|
|
84
69
|
|
|
85
70
|
def flush_buffer(export: bool = False) -> list[BaseMCPCall]:
|
|
@@ -92,25 +77,16 @@ def flush_buffer(export: bool = False) -> list[BaseMCPCall]:
|
|
|
92
77
|
Returns:
|
|
93
78
|
The list of buffered MCP calls
|
|
94
79
|
"""
|
|
95
|
-
|
|
96
|
-
if
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
# Create a telemetry record for export
|
|
106
|
-
_telemetry_record = MCPTelemetryRecord(task_run_id=task_id, records=buffer)
|
|
107
|
-
# In the future, we could call an export function here
|
|
108
|
-
# For now, just log that we have telemetry
|
|
109
|
-
logger.debug("MCP telemetry record created with %d calls", len(buffer))
|
|
110
|
-
else:
|
|
111
|
-
logger.warning("No task_run_id found in buffer, skipping export")
|
|
112
|
-
|
|
113
|
-
return buffer
|
|
80
|
+
task_run_id = get_current_task_run_id()
|
|
81
|
+
if not task_run_id:
|
|
82
|
+
logger.warning("FLUSH_BUFFER: No current task_run_id. Cannot flush.")
|
|
83
|
+
return []
|
|
84
|
+
|
|
85
|
+
buffer_for_task = _GLOBAL_MCP_CALL_BUFFERS.pop(
|
|
86
|
+
task_run_id, []
|
|
87
|
+
) # Get and remove the list for this task
|
|
88
|
+
|
|
89
|
+
return buffer_for_task # Return the flushed items
|
|
114
90
|
|
|
115
91
|
|
|
116
92
|
def create_request_record(
|
|
@@ -150,6 +126,7 @@ def create_response_record(
|
|
|
150
126
|
is_error=is_error,
|
|
151
127
|
**kwargs,
|
|
152
128
|
)
|
|
129
|
+
|
|
153
130
|
buffer_mcp_call(record)
|
|
154
131
|
return record
|
|
155
132
|
|
|
@@ -189,5 +166,4 @@ def create_manual_test_record(**custom_data: Any) -> MCPManualTestCall | None:
|
|
|
189
166
|
def reset_context() -> None:
|
|
190
167
|
"""Reset all telemetry context variables. Useful for test isolation."""
|
|
191
168
|
set_current_task_run_id(None)
|
|
192
|
-
mcp_calls_buffer.set([])
|
|
193
169
|
is_root_trace.set(False)
|
|
@@ -31,9 +31,6 @@ from hud.telemetry.mcp_models import DirectionType, MCPCallType, MCPManualTestCa
|
|
|
31
31
|
|
|
32
32
|
logger = logging.getLogger(__name__)
|
|
33
33
|
|
|
34
|
-
# Ensure no OTel imports remain
|
|
35
|
-
# from opentelemetry import context as otel_context, propagate # Should be removed
|
|
36
|
-
|
|
37
34
|
|
|
38
35
|
class MCPInstrumentor:
|
|
39
36
|
"""
|
|
@@ -140,9 +140,13 @@ class TestMCPCallBuffer:
|
|
|
140
140
|
|
|
141
141
|
# Flush should return all calls from both tasks
|
|
142
142
|
result = flush_buffer()
|
|
143
|
-
assert len(result) ==
|
|
144
|
-
assert result[0] ==
|
|
145
|
-
|
|
143
|
+
assert len(result) == 1
|
|
144
|
+
assert result[0] == mock_call_2
|
|
145
|
+
|
|
146
|
+
set_current_task_run_id("task-1")
|
|
147
|
+
result2 = flush_buffer()
|
|
148
|
+
assert len(result2) == 1
|
|
149
|
+
assert result2[0] == mock_call_1
|
|
146
150
|
|
|
147
151
|
def test_buffer_mcp_call_without_task_id(self):
|
|
148
152
|
"""Test adding MCP call when no task run ID is set."""
|
hud/types.py
CHANGED
|
@@ -48,7 +48,7 @@ class EnvironmentStatus(str, enum.Enum):
|
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
# Available HUD gyms
|
|
51
|
-
ServerGym: TypeAlias = Literal["qa", "hud-browser", "
|
|
51
|
+
ServerGym: TypeAlias = Literal["qa", "hud-browser", "OSWorld-Ubuntu"]
|
|
52
52
|
|
|
53
53
|
# Gyms can be either custom or server-side
|
|
54
54
|
Gym: TypeAlias = CustomGym | ServerGym
|
hud/utils/tests/test_version.py
CHANGED
hud/version.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: SDK for the HUD evaluation platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
|
|
@@ -74,17 +74,17 @@ Description-Content-Type: text/markdown
|
|
|
74
74
|
</div>
|
|
75
75
|
|
|
76
76
|
<h3>
|
|
77
|
-
|
|
77
|
+
Evaluate your Computer Use AI agents across web browsers, desktop environments, and custom scenarios.
|
|
78
78
|
</h3>
|
|
79
79
|
|
|
80
|
-
|
|
81
|
-
>
|
|
82
|
-
> [📅 Hop on a call ](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
|
|
83
|
-
>
|
|
84
|
-
> We're here to help with eval strategies, custom environments, or improving your agent architecture!
|
|
80
|
+
### 🚀 Are you a startup building agents?
|
|
85
81
|
|
|
82
|
+
[📅 Hop on a call](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
|
|
86
83
|
|
|
87
|
-
|
|
84
|
+
We're here to help with eval strategies, custom environments, or improving your agent architecture!
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
> **Early Release Notice**: We'd love to hear your feedback in [Issues](https://github.com/hud-evals/hud-sdk/issues), as the SDK is still evolving!
|
|
88
88
|
|
|
89
89
|
[](https://pypi.org/project/hud-python/)
|
|
90
90
|
|
|
@@ -132,23 +132,23 @@ with hud.trace("my-agent-run"):
|
|
|
132
132
|
result = await agent.run(task)
|
|
133
133
|
```
|
|
134
134
|
|
|
135
|
-
##
|
|
136
|
-
|
|
137
|
-
Before getting started, you'll need to obtain an API key:
|
|
135
|
+
## Quick Start
|
|
138
136
|
|
|
139
|
-
|
|
140
|
-
2. Set it in your environment or .env file:
|
|
137
|
+
### Installation
|
|
141
138
|
|
|
142
139
|
```bash
|
|
143
|
-
|
|
140
|
+
pip install hud-python
|
|
144
141
|
```
|
|
145
142
|
|
|
146
|
-
|
|
143
|
+
### API Key Setup
|
|
147
144
|
|
|
148
|
-
|
|
145
|
+
Before getting started, you'll need to obtain an API key:
|
|
146
|
+
|
|
147
|
+
1. Visit [app.hud.so](https://app.hud.so) to create a free account and generate your API key
|
|
148
|
+
2. Set it in your environment or .env file:
|
|
149
149
|
|
|
150
150
|
```bash
|
|
151
|
-
|
|
151
|
+
export HUD_API_KEY=your_api_key_here
|
|
152
152
|
```
|
|
153
153
|
|
|
154
154
|
### Simple Browser Example with Claude Computer Use
|
|
@@ -269,4 +269,4 @@ If you use this SDK in your research, please cite it as follows:
|
|
|
269
269
|
url = {https://github.com/hud-evals/hud-sdk},
|
|
270
270
|
langid = {en}
|
|
271
271
|
}
|
|
272
|
-
```
|
|
272
|
+
```
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
hud/__init__.py,sha256=6PlxwtjYyaqk6UAyHLJZhsiHRlgndH-Jja9f9BtInUY,1063
|
|
2
2
|
hud/exceptions.py,sha256=pifKvSqxj9_g4NfARVyH5a-lTThhi9XW06tIXaBakQw,5526
|
|
3
3
|
hud/gym.py,sha256=Dl7nur2QTxoVNAcWIvFjuGAbKmoc7CVgjV5gWd35usU,4544
|
|
4
|
-
hud/job.py,sha256=
|
|
5
|
-
hud/settings.py,sha256=
|
|
4
|
+
hud/job.py,sha256=bd88L83L3uqdXE7B3Bjsk8hGk95OggJiLjItFsZXDoQ,25116
|
|
5
|
+
hud/settings.py,sha256=3zALwVbPTaDc01-dR_-rGsrDfc-ieMIcmO5avv6S2Y0,1510
|
|
6
6
|
hud/task.py,sha256=AMmJLYl3BjX8TfBY4ZuR_QIXhTkWDX-4C_Pbi3HziVg,5505
|
|
7
|
-
hud/taskset.py,sha256=
|
|
7
|
+
hud/taskset.py,sha256=ou2Ivulv392txtDvXDpvJAgHj_4h2LZNBaxpeC_mRnw,4903
|
|
8
8
|
hud/trajectory.py,sha256=OrcRbxK_ejFp1VhJCjZnM1WCmCXxEOK4CxNjCngcsjo,3721
|
|
9
|
-
hud/types.py,sha256=
|
|
10
|
-
hud/version.py,sha256=
|
|
9
|
+
hud/types.py,sha256=O8eotDLw4onwRreX9xLPZ2T11qf8wwUidaqGI7jvagY,1808
|
|
10
|
+
hud/version.py,sha256=4c7HS3iYXZGe4vuaPb8pgYwVInf3F3G95h0PITpm6aw,104
|
|
11
11
|
hud/adapters/__init__.py,sha256=zz24KdC_e9TJPgWo6y57_8SzevEE5ak4Cm6tXzMxwRk,266
|
|
12
12
|
hud/adapters/claude/__init__.py,sha256=i7QEF-29FLb9qxp1eYtXs-adIk_tG54tL-9g6d3xodk,100
|
|
13
13
|
hud/adapters/claude/adapter.py,sha256=_qUD0iu0_Y_8yuhrsZw2E5wNv8RB-Aa7BqclAmNHdtI,6096
|
|
@@ -25,7 +25,7 @@ hud/adapters/operator/tests/test_adapter.py,sha256=4RAXwyxAtkh-1Mlt1zJayRkcv3LWa
|
|
|
25
25
|
hud/agent/__init__.py,sha256=_OxMG3UW1vXSuixdpo09b1jexfWcUbfK44zto8t6_LE,453
|
|
26
26
|
hud/agent/base.py,sha256=d7eMoRPepVSCFUyU1oV1hGvyff8rsPtXDelVcJlEF7Y,4022
|
|
27
27
|
hud/agent/claude.py,sha256=bXgdzlZHaIHaxrGIYt6w1kEh5oHFDT5P2u304swi8wU,7529
|
|
28
|
-
hud/agent/claude_plays_pokemon.py,sha256=
|
|
28
|
+
hud/agent/claude_plays_pokemon.py,sha256=H1AIXb6qNE-45G5CdBgsDGTMXOgjg44wGtySgeKgeq4,10009
|
|
29
29
|
hud/agent/langchain.py,sha256=iuMpu-k55Qic0LCzfOR0Wa5kDhIv7zHCc8et3axypus,8833
|
|
30
30
|
hud/agent/operator.py,sha256=Bji_v6NB-hUyTe1otdpEc3Hb0ZpyQbuL_iZu5irYFe4,8612
|
|
31
31
|
hud/agent/misc/__init__.py,sha256=-ftYH1T5r7fXKKra6d8jXYmUz9KOTmYwBrPJU-V3S7g,71
|
|
@@ -38,7 +38,7 @@ hud/env/docker_client.py,sha256=-nQLGeRl9GLJKEku1eIBL3RQuteIPLrecjVHxzSD_vU,1046
|
|
|
38
38
|
hud/env/environment.py,sha256=hSJh5KaiGuWw4IEV1IHNDVjOxyoyOxhHkavGj_5RBbQ,15154
|
|
39
39
|
hud/env/local_docker_client.py,sha256=bcgmmRJGX1--bf0-5Zlk59l2W0PvABnDn4FWZKvXjjk,9565
|
|
40
40
|
hud/env/remote_client.py,sha256=gjCzcuotvDC7GraVBBT2Tix5GKpntwtldv5PqnXx8wk,6109
|
|
41
|
-
hud/env/remote_docker_client.py,sha256=
|
|
41
|
+
hud/env/remote_docker_client.py,sha256=ZLqbd6IeU9BDndjwanmJN3_1CEsrCkntumGavLiPi88,9484
|
|
42
42
|
hud/evaluators/__init__.py,sha256=V5nktEAw3EDn2Y537pjia5Y1IjdLBIPrDjTs6YTCdX4,153
|
|
43
43
|
hud/evaluators/base.py,sha256=ALO9Rj-R_9HtHIHYp84bsQQD12De0XnCTwad78_T5-k,771
|
|
44
44
|
hud/evaluators/inspect.py,sha256=ZvrTXLpgibyvQ5aNXAMP4quyXISrRQHg9besDcuCx7U,692
|
|
@@ -51,19 +51,19 @@ hud/evaluators/tests/test_judge.py,sha256=c1GaAeq_WpBVgBlx-gQncHrOPokzKNxlbgiC8W
|
|
|
51
51
|
hud/evaluators/tests/test_match.py,sha256=C04GoluyT9i41YZ65xEjN7tKHQbENbrpNhNtUd4ivmA,3919
|
|
52
52
|
hud/evaluators/tests/test_remote.py,sha256=YdJpyyuRLkYP0e3jTUkD3zobS2WHQPePn8yBZtYOIN4,3243
|
|
53
53
|
hud/server/__init__.py,sha256=IPxPCqtPLguryN-nBq78Sakypw2bRiE2iHv3SXG8YRk,139
|
|
54
|
-
hud/server/requests.py,sha256=
|
|
54
|
+
hud/server/requests.py,sha256=AnFW4ELojjvfF6xjS2no6_fg4Rph2aR2hjPzYTede0Q,8841
|
|
55
55
|
hud/server/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
56
56
|
hud/server/tests/test_requests.py,sha256=63YCbykcib5MxKxm-OgHJPLX3QC7hmgIwnWaYukVM6s,9077
|
|
57
57
|
hud/telemetry/__init__.py,sha256=ky48kuZD3Bt0vOf9FwZwkV_ka7O26Tvcxh7p1lMpsMk,582
|
|
58
58
|
hud/telemetry/_trace.py,sha256=W7S6CxwtmjNl4OZbA1SQHXsaNm072J9c-fjPjQomgOY,5135
|
|
59
|
-
hud/telemetry/context.py,sha256=
|
|
59
|
+
hud/telemetry/context.py,sha256=PNbfrMgjeRTTg0nUKXYCflqn71I_cSjU8LXdvouUfc4,5209
|
|
60
60
|
hud/telemetry/exporter.py,sha256=l-r7mADcHpn6i9hhB407hx3HS4khfbhuwX0txJ2X0VQ,17986
|
|
61
61
|
hud/telemetry/mcp_models.py,sha256=YIArMtCVfC4NVvaEmUYs_kxDs0GQ-xtFFmB8jEGKaag,11342
|
|
62
62
|
hud/telemetry/instrumentation/__init__.py,sha256=vHmSqaJMMehgRNn6EN2SMoYDD12rSHkLeVmj7Uy1my0,88
|
|
63
|
-
hud/telemetry/instrumentation/mcp.py,sha256=
|
|
63
|
+
hud/telemetry/instrumentation/mcp.py,sha256=xGAMdhTgM1ixHiDX7xkS9Ax1NCjK3u7pLWIbIh8WZIA,21925
|
|
64
64
|
hud/telemetry/instrumentation/registry.py,sha256=UVaSsEA693lvKYd5R3n3ve6GcAB1fwqubRwIVeZiNmo,1821
|
|
65
65
|
hud/telemetry/tests/__init__.py,sha256=QMN8OzfrBUDbQESwrwHCqXLdDwCjYWX8BJcpeLUJfqA,33
|
|
66
|
-
hud/telemetry/tests/test_context.py,sha256=
|
|
66
|
+
hud/telemetry/tests/test_context.py,sha256=BGRDlXXC_VbpD4cYl_o9gRQDDKb2ox1das_ZuX14NC8,6531
|
|
67
67
|
hud/telemetry/tests/test_trace.py,sha256=JzmjNRtHdQFPqLm7hOPastENg-hMJo9p8bbxJ77iXyc,10687
|
|
68
68
|
hud/utils/__init__.py,sha256=oSl_gGoS272X2VFnBYX8hLxcP2xgGoBYQXAuLhtQgw8,260
|
|
69
69
|
hud/utils/common.py,sha256=R83ntEtKr8KXG1mKcy0I_OllFHkDrPMysPhW12uBckc,3926
|
|
@@ -76,9 +76,9 @@ hud/utils/tests/test_common.py,sha256=gbYpQKBNdbCcEH0v1UZpxLt_NW2T5sETMIJKvy8S8p
|
|
|
76
76
|
hud/utils/tests/test_config.py,sha256=dPlXYWuMrxX-NOYbf0vdJ27TJpfacKG8eiKOSGOcfDU,4079
|
|
77
77
|
hud/utils/tests/test_progress.py,sha256=QunwDgi_heQXhDgmC25zgjr-sFUu5FdJ_1aYigMKeIc,6351
|
|
78
78
|
hud/utils/tests/test_telemetry.py,sha256=t0An1RTBaE0dZVEpF4uwuq5k1R-PXFR5k4u71h60tx8,1224
|
|
79
|
-
hud/utils/tests/test_version.py,sha256=
|
|
79
|
+
hud/utils/tests/test_version.py,sha256=wDkcJbOW78FEZpIGKXST347GOm3o_NBVONZ3RhkYIM4,159
|
|
80
80
|
hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
81
|
-
hud_python-0.2.
|
|
82
|
-
hud_python-0.2.
|
|
83
|
-
hud_python-0.2.
|
|
84
|
-
hud_python-0.2.
|
|
81
|
+
hud_python-0.2.6.dist-info/METADATA,sha256=xXaqxhBWDKs-vkGiCi19m4wBSbbuehk4fEd1zP5Ufbg,9469
|
|
82
|
+
hud_python-0.2.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
83
|
+
hud_python-0.2.6.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
|
|
84
|
+
hud_python-0.2.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|