hud-python 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +16 -12
- hud/adapters/__init__.py +4 -2
- hud/adapters/claude/adapter.py +0 -1
- hud/adapters/common/adapter.py +11 -10
- hud/adapters/common/types.py +27 -13
- hud/adapters/operator/__init__.py +5 -0
- hud/adapters/operator/adapter.py +93 -0
- hud/agent/__init__.py +7 -0
- hud/agent/base.py +109 -0
- hud/agent/claude.py +187 -0
- hud/agent/operator.py +190 -0
- hud/env/__init__.py +11 -0
- hud/env/client.py +35 -0
- hud/env/docker_client.py +306 -0
- hud/env/environment.py +181 -0
- hud/env/local_docker_client.py +249 -0
- hud/env/remote_client.py +185 -0
- hud/env/remote_docker_client.py +221 -0
- hud/evaluators/__init__.py +10 -0
- hud/evaluators/base.py +31 -0
- hud/evaluators/inspect.py +29 -0
- hud/evaluators/judge.py +213 -0
- hud/evaluators/match.py +163 -0
- hud/evaluators/remote.py +78 -0
- hud/gym.py +101 -15
- hud/job.py +185 -0
- hud/server/__init__.py +2 -2
- hud/server/requests.py +87 -0
- hud/settings.py +13 -2
- hud/task.py +133 -0
- hud/taskset.py +95 -0
- hud/trajectory.py +90 -0
- hud/types.py +65 -0
- hud/utils/__init__.py +4 -2
- hud/utils/common.py +69 -0
- hud/utils/config.py +182 -4
- hud/utils/telemetry.py +67 -0
- hud_python-0.2.0.dist-info/METADATA +188 -0
- hud_python-0.2.0.dist-info/RECORD +44 -0
- {hud_python-0.1.4.dist-info → hud_python-0.2.0.dist-info}/licenses/LICENSE +1 -1
- hud/client.py +0 -200
- hud/environment.py +0 -317
- hud/run.py +0 -208
- hud_python-0.1.4.dist-info/METADATA +0 -125
- hud_python-0.1.4.dist-info/RECORD +0 -21
- {hud_python-0.1.4.dist-info → hud_python-0.2.0.dist-info}/WHEEL +0 -0
hud/environment.py
DELETED
|
@@ -1,317 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
import enum
|
|
5
|
-
import logging
|
|
6
|
-
from typing import TYPE_CHECKING, Any
|
|
7
|
-
|
|
8
|
-
from pydantic import BaseModel
|
|
9
|
-
|
|
10
|
-
from hud.server import make_request
|
|
11
|
-
from hud.settings import settings
|
|
12
|
-
|
|
13
|
-
if TYPE_CHECKING:
|
|
14
|
-
from .adapters.common import Adapter
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class BaseResponseWithLogs(BaseModel):
|
|
18
|
-
"""Base model for API responses that include logs."""
|
|
19
|
-
logs: str | None = None
|
|
20
|
-
error: str | None = None
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
class RewardResponse(BaseResponseWithLogs):
|
|
24
|
-
reward: float
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
logger = logging.getLogger("hud.environment")
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class Observation(BaseModel):
|
|
31
|
-
"""
|
|
32
|
-
Observation from the environment.
|
|
33
|
-
|
|
34
|
-
Attributes:
|
|
35
|
-
screenshot: Base64 encoded PNG string of the screen
|
|
36
|
-
text: Text observation, if available
|
|
37
|
-
"""
|
|
38
|
-
|
|
39
|
-
screenshot: str | None = None # base64 string png
|
|
40
|
-
text: str | None = None
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
class TaskResult(BaseModel):
|
|
44
|
-
"""
|
|
45
|
-
Result of a task step.
|
|
46
|
-
|
|
47
|
-
Attributes:
|
|
48
|
-
observation: The current observation
|
|
49
|
-
reward: Reward value from the step
|
|
50
|
-
terminated: Whether the task is complete
|
|
51
|
-
info: Additional information from the environment
|
|
52
|
-
"""
|
|
53
|
-
|
|
54
|
-
observation: Observation
|
|
55
|
-
reward: float
|
|
56
|
-
terminated: bool
|
|
57
|
-
info: dict[str, Any]
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
class EnvironmentStatus(str, enum.Enum):
|
|
61
|
-
"""
|
|
62
|
-
Status of the environment.
|
|
63
|
-
|
|
64
|
-
Attributes:
|
|
65
|
-
INITIALIZING: The environment is initializing
|
|
66
|
-
RUNNING: The environment is running
|
|
67
|
-
COMPLETED: The environment is completed
|
|
68
|
-
ERROR: The environment is in an error state
|
|
69
|
-
"""
|
|
70
|
-
|
|
71
|
-
INITIALIZING = "initializing"
|
|
72
|
-
RUNNING = "running"
|
|
73
|
-
COMPLETED = "completed"
|
|
74
|
-
ERROR = "error"
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
status_messages = {
|
|
78
|
-
EnvironmentStatus.RUNNING.value: "is running",
|
|
79
|
-
EnvironmentStatus.ERROR.value: "had an error initializing",
|
|
80
|
-
EnvironmentStatus.COMPLETED.value: "completed",
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
class Environment:
|
|
85
|
-
"""
|
|
86
|
-
Environment interface for agent interactions.
|
|
87
|
-
|
|
88
|
-
This class handles the environment state and interactions, including
|
|
89
|
-
creating the environment, retrieving state, and executing actions.
|
|
90
|
-
"""
|
|
91
|
-
|
|
92
|
-
def __init__(
|
|
93
|
-
self,
|
|
94
|
-
adapter: Adapter,
|
|
95
|
-
run_id: str,
|
|
96
|
-
id: str | None = None,
|
|
97
|
-
config: dict[str, Any] | None = None,
|
|
98
|
-
metadata: dict[str, Any] | None = None,
|
|
99
|
-
) -> None:
|
|
100
|
-
"""
|
|
101
|
-
Initialize an environment.
|
|
102
|
-
|
|
103
|
-
Args:
|
|
104
|
-
adapter: Adapter for converting actions
|
|
105
|
-
run_id: ID of the run this environment belongs to
|
|
106
|
-
id: Optional ID of an existing environment
|
|
107
|
-
config: Optional configuration parameters
|
|
108
|
-
metadata: Optional metadata for the environment
|
|
109
|
-
"""
|
|
110
|
-
if metadata is None:
|
|
111
|
-
metadata = {}
|
|
112
|
-
if config is None:
|
|
113
|
-
config = {}
|
|
114
|
-
self.run_id = run_id
|
|
115
|
-
self.config = config
|
|
116
|
-
self.adapter = adapter
|
|
117
|
-
self.metadata = metadata
|
|
118
|
-
self.final_response: None | str = None
|
|
119
|
-
self.id = id
|
|
120
|
-
self.vnc_url = None
|
|
121
|
-
|
|
122
|
-
async def create_environment(self) -> str:
|
|
123
|
-
"""
|
|
124
|
-
Initialize the environment and return the task_run_id.
|
|
125
|
-
|
|
126
|
-
Returns:
|
|
127
|
-
str: The environment ID
|
|
128
|
-
"""
|
|
129
|
-
data = await make_request(
|
|
130
|
-
method="POST",
|
|
131
|
-
url=f"{settings.base_url}/create_environment",
|
|
132
|
-
json={"run_id": self.run_id, "metadata": self.metadata},
|
|
133
|
-
api_key=settings.api_key,
|
|
134
|
-
)
|
|
135
|
-
self.id = data["id"]
|
|
136
|
-
return self.id
|
|
137
|
-
|
|
138
|
-
async def get_vnc_url(self) -> str:
|
|
139
|
-
"""
|
|
140
|
-
Get the VNC URL for the environment.
|
|
141
|
-
|
|
142
|
-
Returns:
|
|
143
|
-
str: The VNC URL for remote viewing/control
|
|
144
|
-
"""
|
|
145
|
-
data = await make_request(
|
|
146
|
-
method="GET",
|
|
147
|
-
url=f"{settings.base_url}/environment/{self.id}/vnc",
|
|
148
|
-
api_key=settings.api_key,
|
|
149
|
-
)
|
|
150
|
-
self.vnc_url = data["vm_url"]
|
|
151
|
-
return self.vnc_url
|
|
152
|
-
|
|
153
|
-
async def get_env_state(self) -> str:
|
|
154
|
-
"""
|
|
155
|
-
Get the state of the environment.
|
|
156
|
-
|
|
157
|
-
Returns:
|
|
158
|
-
str: The current state (e.g., "running", "error")
|
|
159
|
-
"""
|
|
160
|
-
data = await make_request(
|
|
161
|
-
method="GET",
|
|
162
|
-
url=f"{settings.base_url}/get_env_state/{self.id}",
|
|
163
|
-
api_key=settings.api_key,
|
|
164
|
-
)
|
|
165
|
-
return data["state"]
|
|
166
|
-
|
|
167
|
-
async def step(
|
|
168
|
-
self, action: Any | None = None
|
|
169
|
-
) -> tuple[Observation, float, bool, dict[str, Any]]:
|
|
170
|
-
"""
|
|
171
|
-
Send action to environment and get result.
|
|
172
|
-
|
|
173
|
-
Args:
|
|
174
|
-
action: The action to take, or None for no action
|
|
175
|
-
|
|
176
|
-
Returns:
|
|
177
|
-
tuple: (observation, reward, terminated, info)
|
|
178
|
-
"""
|
|
179
|
-
action_list = self.translate_action(action) if action is not None else []
|
|
180
|
-
data = await make_request(
|
|
181
|
-
method="POST",
|
|
182
|
-
url=f"{settings.base_url}/execute_step/{self.id}",
|
|
183
|
-
json=action_list,
|
|
184
|
-
api_key=settings.api_key,
|
|
185
|
-
)
|
|
186
|
-
# Convert the raw observation to the correct type
|
|
187
|
-
self.current_observation = Observation(**data["observation"])
|
|
188
|
-
data["observation"] = self.current_observation
|
|
189
|
-
# Return the result
|
|
190
|
-
task_result = TaskResult(**data)
|
|
191
|
-
return (
|
|
192
|
-
task_result.observation,
|
|
193
|
-
task_result.reward,
|
|
194
|
-
task_result.terminated,
|
|
195
|
-
task_result.info,
|
|
196
|
-
)
|
|
197
|
-
|
|
198
|
-
def translate_action(self, action: Any) -> list:
|
|
199
|
-
"""
|
|
200
|
-
Translate action to the correct format.
|
|
201
|
-
|
|
202
|
-
Args:
|
|
203
|
-
action: The action to translate
|
|
204
|
-
|
|
205
|
-
Returns:
|
|
206
|
-
list: List of translated actions in the CLA format
|
|
207
|
-
"""
|
|
208
|
-
# Get adapter and then translate action to Common Language Action
|
|
209
|
-
if isinstance(action, list):
|
|
210
|
-
return self.adapter.adapt_list(action)
|
|
211
|
-
return [self.adapter.adapt(action)]
|
|
212
|
-
|
|
213
|
-
async def evaluate(self) -> RewardResponse:
|
|
214
|
-
"""
|
|
215
|
-
Get final evaluation score.
|
|
216
|
-
|
|
217
|
-
Returns:
|
|
218
|
-
RewardResponse: The evaluation response containing reward, logs, and possible error
|
|
219
|
-
"""
|
|
220
|
-
data = await make_request(
|
|
221
|
-
method="POST",
|
|
222
|
-
url=f"{settings.base_url}/evaluation/{self.id}",
|
|
223
|
-
api_key=settings.api_key,
|
|
224
|
-
)
|
|
225
|
-
return RewardResponse(**data)
|
|
226
|
-
|
|
227
|
-
async def close(self) -> None:
|
|
228
|
-
"""
|
|
229
|
-
Close the environment.
|
|
230
|
-
"""
|
|
231
|
-
await make_request(
|
|
232
|
-
method="POST",
|
|
233
|
-
url=f"{settings.base_url}/close/{self.id}",
|
|
234
|
-
api_key=settings.api_key,
|
|
235
|
-
)
|
|
236
|
-
|
|
237
|
-
async def reset(self, task_id: str, metadata: dict[str, Any] | None = None) -> Observation:
|
|
238
|
-
"""
|
|
239
|
-
Reset the environment to the task.
|
|
240
|
-
|
|
241
|
-
Args:
|
|
242
|
-
task_id: ID of the task to reset to
|
|
243
|
-
metadata: Optional metadata for the reset
|
|
244
|
-
|
|
245
|
-
Returns:
|
|
246
|
-
Observation: Initial observation for the task
|
|
247
|
-
"""
|
|
248
|
-
if metadata is None:
|
|
249
|
-
metadata = {}
|
|
250
|
-
data = await make_request(
|
|
251
|
-
method="POST",
|
|
252
|
-
url=f"{settings.base_url}/environments/{self.id}/reset",
|
|
253
|
-
json={"task_id": task_id, "metadata": metadata},
|
|
254
|
-
api_key=settings.api_key,
|
|
255
|
-
)
|
|
256
|
-
return Observation(**data["observation"])
|
|
257
|
-
|
|
258
|
-
async def wait_for_ready(self) -> None:
|
|
259
|
-
"""Wait for the environment to be ready"""
|
|
260
|
-
while True:
|
|
261
|
-
state = await self.get_env_state()
|
|
262
|
-
if state in (
|
|
263
|
-
EnvironmentStatus.RUNNING.value,
|
|
264
|
-
EnvironmentStatus.ERROR.value,
|
|
265
|
-
EnvironmentStatus.COMPLETED.value,
|
|
266
|
-
):
|
|
267
|
-
logger.info("Environment %s %s", self.id, status_messages.get(state))
|
|
268
|
-
break
|
|
269
|
-
await asyncio.sleep(10)
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
class EvalSet:
|
|
273
|
-
"""
|
|
274
|
-
Evaluation set containing tasks for benchmarking.
|
|
275
|
-
|
|
276
|
-
Attributes:
|
|
277
|
-
id: Unique identifier for the evalset
|
|
278
|
-
name: Human-readable name
|
|
279
|
-
tasks: List of task IDs in this evalset
|
|
280
|
-
"""
|
|
281
|
-
|
|
282
|
-
def __init__(
|
|
283
|
-
self,
|
|
284
|
-
id: str,
|
|
285
|
-
name: str,
|
|
286
|
-
tasks: list[str] | None = None,
|
|
287
|
-
configs: dict[str, Any] | None = None,
|
|
288
|
-
) -> None:
|
|
289
|
-
"""
|
|
290
|
-
Initialize an evaluation set.
|
|
291
|
-
|
|
292
|
-
Args:
|
|
293
|
-
id: Unique identifier
|
|
294
|
-
name: Human-readable name
|
|
295
|
-
tasks: Optional list of task IDs
|
|
296
|
-
"""
|
|
297
|
-
self.id = id
|
|
298
|
-
self.name = name
|
|
299
|
-
self.tasks = tasks or []
|
|
300
|
-
self.configs = configs or {}
|
|
301
|
-
|
|
302
|
-
async def fetch_tasks(self) -> list[str]:
|
|
303
|
-
"""
|
|
304
|
-
Fetch all tasks in this evalset from the API.
|
|
305
|
-
|
|
306
|
-
Returns:
|
|
307
|
-
list[str]: List of task IDs
|
|
308
|
-
"""
|
|
309
|
-
data = await make_request(
|
|
310
|
-
method="GET",
|
|
311
|
-
url=f"{settings.base_url}/evalsets/{self.id}/tasks",
|
|
312
|
-
api_key=settings.api_key,
|
|
313
|
-
)
|
|
314
|
-
# Extracts a list of task ids and list of config objects for the evalset
|
|
315
|
-
self.tasks = data["tasks"]
|
|
316
|
-
self.configs = data["evalset"]
|
|
317
|
-
return self.tasks
|
hud/run.py
DELETED
|
@@ -1,208 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import datetime
|
|
4
|
-
from typing import TYPE_CHECKING, Any
|
|
5
|
-
|
|
6
|
-
from pydantic import BaseModel, Field
|
|
7
|
-
|
|
8
|
-
from .adapters.common import Adapter
|
|
9
|
-
from .environment import Environment, EvalSet
|
|
10
|
-
from .server import make_request
|
|
11
|
-
from .settings import settings
|
|
12
|
-
|
|
13
|
-
if TYPE_CHECKING:
|
|
14
|
-
import datetime
|
|
15
|
-
|
|
16
|
-
from .gym import Gym
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class RunResponse(BaseModel):
|
|
20
|
-
"""
|
|
21
|
-
Response model for run data from the API.
|
|
22
|
-
|
|
23
|
-
Attributes:
|
|
24
|
-
id: Unique identifier for the run
|
|
25
|
-
name: Human-readable name of the run
|
|
26
|
-
gym: Dictionary containing gym information
|
|
27
|
-
evalset: Dictionary containing evalset information
|
|
28
|
-
config: Dictionary containing configuration parameters
|
|
29
|
-
metadata: Dictionary containing metadata
|
|
30
|
-
"""
|
|
31
|
-
|
|
32
|
-
id: str
|
|
33
|
-
name: str
|
|
34
|
-
gym: dict[str, Any]
|
|
35
|
-
evalset: dict[str, Any]
|
|
36
|
-
config: dict[str, Any]
|
|
37
|
-
metadata: dict[str, Any]
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class RunAnalyticsResponse(BaseModel):
|
|
41
|
-
"""
|
|
42
|
-
Model for Run analytics data.
|
|
43
|
-
|
|
44
|
-
Attributes:
|
|
45
|
-
id: Unique identifier for the run
|
|
46
|
-
name: Human-readable name of the run
|
|
47
|
-
status_counts: Counts of tasks in different states
|
|
48
|
-
avg_score: Average score across all tasks, if available
|
|
49
|
-
completion_rate: Percentage of tasks completed
|
|
50
|
-
total_tasks: Total number of tasks in the run
|
|
51
|
-
completed_tasks: Number of completed tasks
|
|
52
|
-
running_time: Total runtime in seconds, if available
|
|
53
|
-
created_at: When the run was created
|
|
54
|
-
raw_data: Detailed data about tasks and environments
|
|
55
|
-
"""
|
|
56
|
-
|
|
57
|
-
id: str
|
|
58
|
-
name: str
|
|
59
|
-
status_counts: dict[str, int] # e.g. {"completed": 5, "running": 2, "error": 1}
|
|
60
|
-
avg_score: float | None = None
|
|
61
|
-
completion_rate: float | None = None # percentage of tasks completed
|
|
62
|
-
total_tasks: int
|
|
63
|
-
completed_tasks: int
|
|
64
|
-
running_time: float | None = None # runtime in seconds if available
|
|
65
|
-
created_at: datetime.datetime
|
|
66
|
-
raw_data: dict[str, list[dict[str, Any]]] = Field(
|
|
67
|
-
default_factory=lambda: {"tasks": [], "environments": []}
|
|
68
|
-
)
|
|
69
|
-
|
|
70
|
-
def __str__(self) -> str:
|
|
71
|
-
return self.visualize()
|
|
72
|
-
|
|
73
|
-
def visualize(self) -> str:
|
|
74
|
-
"""
|
|
75
|
-
Generate an ASCII bar chart visualization of run analytics.
|
|
76
|
-
|
|
77
|
-
Args:
|
|
78
|
-
data: The run analytics data to visualize
|
|
79
|
-
|
|
80
|
-
Returns:
|
|
81
|
-
A string containing an ASCII visualization
|
|
82
|
-
"""
|
|
83
|
-
max_width = 50
|
|
84
|
-
|
|
85
|
-
completion_rate = self.completion_rate if self.completion_rate is not None else 0
|
|
86
|
-
|
|
87
|
-
result = [
|
|
88
|
-
f"Run: {self.name} (ID: {self.id})",
|
|
89
|
-
f"Created: {self.created_at.strftime('%Y-%m-%d %H:%M:%S')}",
|
|
90
|
-
"-" * 60,
|
|
91
|
-
f"""Progress: {self.completed_tasks}/{self.total_tasks} tasks completed (
|
|
92
|
-
{completion_rate:.1f}% completion rate)""",
|
|
93
|
-
"",
|
|
94
|
-
]
|
|
95
|
-
|
|
96
|
-
result.append("Status Distribution:")
|
|
97
|
-
total = sum(self.status_counts.values())
|
|
98
|
-
for status, count in self.status_counts.items():
|
|
99
|
-
percentage = (count / total) * 100
|
|
100
|
-
bar_length = int((count / total) * max_width)
|
|
101
|
-
bar = "█" * bar_length
|
|
102
|
-
result.append(f"{status.ljust(10)}: {bar} {count} ({percentage:.1f}%)")
|
|
103
|
-
|
|
104
|
-
if self.avg_score is not None:
|
|
105
|
-
result.append("")
|
|
106
|
-
result.append(f"Average Score: {self.avg_score:.2f}")
|
|
107
|
-
|
|
108
|
-
score_bar_length = int((self.avg_score / 100) * max_width)
|
|
109
|
-
score_bar = "█" * score_bar_length
|
|
110
|
-
result.append(f"Score: {score_bar} {self.avg_score:.2f}/1.00")
|
|
111
|
-
|
|
112
|
-
if self.running_time is not None:
|
|
113
|
-
hours, remainder = divmod(self.running_time, 3600)
|
|
114
|
-
minutes, seconds = divmod(remainder, 60)
|
|
115
|
-
runtime_str = f"{int(hours)}h {int(minutes)}m {int(seconds)}s"
|
|
116
|
-
result.append(f"Total Runtime: {runtime_str}")
|
|
117
|
-
|
|
118
|
-
return "\n".join(result)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
class Run:
|
|
122
|
-
"""
|
|
123
|
-
A run represents a collection of tasks and environments.
|
|
124
|
-
|
|
125
|
-
This class provides methods to fetch task IDs, create environments,
|
|
126
|
-
and access analytics for the run.
|
|
127
|
-
"""
|
|
128
|
-
|
|
129
|
-
def __init__(
|
|
130
|
-
self,
|
|
131
|
-
id: str,
|
|
132
|
-
name: str,
|
|
133
|
-
gym: Gym,
|
|
134
|
-
evalset: EvalSet,
|
|
135
|
-
config: dict[str, Any] | None = None,
|
|
136
|
-
metadata: dict[str, Any] | None = None,
|
|
137
|
-
adapter: Adapter | None = None,
|
|
138
|
-
) -> None:
|
|
139
|
-
"""
|
|
140
|
-
Initialize a run.
|
|
141
|
-
|
|
142
|
-
Args:
|
|
143
|
-
id: Unique identifier
|
|
144
|
-
name: Human-readable name
|
|
145
|
-
gym: Gym object for this run
|
|
146
|
-
evalset: EvalSet object containing tasks
|
|
147
|
-
config: Optional configuration parameters
|
|
148
|
-
metadata: Optional metadata
|
|
149
|
-
adapter: Optional adapter for action conversion
|
|
150
|
-
"""
|
|
151
|
-
adapter = adapter or Adapter()
|
|
152
|
-
if metadata is None:
|
|
153
|
-
metadata = {}
|
|
154
|
-
if config is None:
|
|
155
|
-
config = {}
|
|
156
|
-
self.id = id
|
|
157
|
-
self.name = name
|
|
158
|
-
self.gym = gym
|
|
159
|
-
self.evalset = evalset
|
|
160
|
-
self.adapter = adapter
|
|
161
|
-
self.config = config
|
|
162
|
-
self.metadata = metadata
|
|
163
|
-
self.environments: list[Environment] = []
|
|
164
|
-
|
|
165
|
-
async def fetch_task_ids(self) -> list[str]:
|
|
166
|
-
"""
|
|
167
|
-
Fetch task IDs for this run from the evalset.
|
|
168
|
-
|
|
169
|
-
Returns:
|
|
170
|
-
list[str]: List of task IDs
|
|
171
|
-
"""
|
|
172
|
-
return await self.evalset.fetch_tasks()
|
|
173
|
-
|
|
174
|
-
async def make(self, metadata: dict[str, Any] | None = None) -> Environment:
|
|
175
|
-
"""
|
|
176
|
-
Create a new environment for this run.
|
|
177
|
-
|
|
178
|
-
Args:
|
|
179
|
-
metadata: Metadata for the environment
|
|
180
|
-
|
|
181
|
-
Returns:
|
|
182
|
-
Environment: The created environment
|
|
183
|
-
"""
|
|
184
|
-
# Make the env class
|
|
185
|
-
env = Environment(
|
|
186
|
-
run_id=self.id,
|
|
187
|
-
config=self.config,
|
|
188
|
-
adapter=self.adapter,
|
|
189
|
-
metadata=metadata or {},
|
|
190
|
-
)
|
|
191
|
-
await env.create_environment()
|
|
192
|
-
self.environments.append(env)
|
|
193
|
-
return env
|
|
194
|
-
|
|
195
|
-
async def get_analytics(self) -> RunAnalyticsResponse:
|
|
196
|
-
"""
|
|
197
|
-
Get analytics for this run.
|
|
198
|
-
|
|
199
|
-
Returns:
|
|
200
|
-
RunAnalyticsResponse: Analytics data including status counts,
|
|
201
|
-
average score, and other metrics
|
|
202
|
-
"""
|
|
203
|
-
data = await make_request(
|
|
204
|
-
method="GET",
|
|
205
|
-
url=f"{settings.base_url}/runs/{self.id}/analytics",
|
|
206
|
-
api_key=settings.api_key,
|
|
207
|
-
)
|
|
208
|
-
return RunAnalyticsResponse(**data)
|
|
@@ -1,125 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: hud-python
|
|
3
|
-
Version: 0.1.4
|
|
4
|
-
Summary: SDK for the HUD evaluation platform.
|
|
5
|
-
Project-URL: Homepage, https://github.com/Human-Data/hud-sdk
|
|
6
|
-
Project-URL: Bug Tracker, https://github.com/Human-Data/hud-sdk/issues
|
|
7
|
-
Project-URL: Documentation, https://hud.so
|
|
8
|
-
Author-email: Human Union Data SDK <founders@hud.so>
|
|
9
|
-
License: MIT License
|
|
10
|
-
|
|
11
|
-
Copyright (c) 2025 Human Data Company
|
|
12
|
-
|
|
13
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
14
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
15
|
-
in the Software without restriction, including without limitation the rights
|
|
16
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
17
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
18
|
-
furnished to do so, subject to the following conditions:
|
|
19
|
-
|
|
20
|
-
The above copyright notice and this permission notice shall be included in all
|
|
21
|
-
copies or substantial portions of the Software.
|
|
22
|
-
|
|
23
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
24
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
25
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
26
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
27
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
28
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
29
|
-
SOFTWARE.
|
|
30
|
-
License-File: LICENSE
|
|
31
|
-
Classifier: Development Status :: 4 - Beta
|
|
32
|
-
Classifier: Intended Audience :: Developers
|
|
33
|
-
Classifier: Programming Language :: Python :: 3
|
|
34
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
35
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
36
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
37
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
38
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
39
|
-
Requires-Python: <3.14,>=3.9
|
|
40
|
-
Requires-Dist: eval-type-backport>=0.2.2
|
|
41
|
-
Requires-Dist: httpx<1,>=0.23.0
|
|
42
|
-
Requires-Dist: pillow<12,>=11
|
|
43
|
-
Requires-Dist: pydantic-settings<3,>=2
|
|
44
|
-
Requires-Dist: pydantic<3,>=2
|
|
45
|
-
Provides-Extra: dev
|
|
46
|
-
Requires-Dist: anthropic; extra == 'dev'
|
|
47
|
-
Requires-Dist: dotenv; extra == 'dev'
|
|
48
|
-
Requires-Dist: ipykernel; extra == 'dev'
|
|
49
|
-
Requires-Dist: ipython<9; extra == 'dev'
|
|
50
|
-
Requires-Dist: jupyter-client; extra == 'dev'
|
|
51
|
-
Requires-Dist: jupyter-core; extra == 'dev'
|
|
52
|
-
Requires-Dist: openai; extra == 'dev'
|
|
53
|
-
Requires-Dist: pyright==1.1.364; extra == 'dev'
|
|
54
|
-
Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
|
|
55
|
-
Requires-Dist: ruff==0.9.8; extra == 'dev'
|
|
56
|
-
Description-Content-Type: text/markdown
|
|
57
|
-
|
|
58
|
-
# HUD
|
|
59
|
-
|
|
60
|
-
A Python SDK for interacting with HUD environments and evaluation benchmarks for browser use and computer use models.
|
|
61
|
-
|
|
62
|
-
> **Alpha Release Notice**: This SDK is currently in early release status. The API is evolving and may change in future releases as we gather feedback and improve functionality.
|
|
63
|
-
|
|
64
|
-
[PyPI package](https://pypi.org/project/hud-python/)
|
|
65
|
-
|
|
66
|
-
[📚 Documentation](https://documentation.hud.so) | [🏠 Homepage](https://hud.so)
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
## Quick start
|
|
70
|
-
|
|
71
|
-
[RECOMMENDED] To get started with an agent, see the [Claude Computer use example](https://github.com/Human-Data/hud-sdk/tree/main/examples).
|
|
72
|
-
|
|
73
|
-
Install the package with Python>=3.9:
|
|
74
|
-
```bash
|
|
75
|
-
pip install hud-python
|
|
76
|
-
```
|
|
77
|
-
|
|
78
|
-
Make sure to set up your account with us (email founders@hud.so) and add your API key to the environment variables:
|
|
79
|
-
```bash
|
|
80
|
-
HUD_API_KEY=<your-api-key>
|
|
81
|
-
```
|
|
82
|
-
|
|
83
|
-
Load in your agent and create a run! Go to the [examples](https://github.com/Human-Data/hud-sdk/tree/main/examples) folder for more examples.
|
|
84
|
-
```python
|
|
85
|
-
import asyncio
|
|
86
|
-
from hud import HUDClient
|
|
87
|
-
|
|
88
|
-
async def main():
|
|
89
|
-
# Initialize client with API key
|
|
90
|
-
client = HUDClient(api_key=os.getenv("HUD_API_KEY"))
|
|
91
|
-
|
|
92
|
-
# Load a gym and evaluation set
|
|
93
|
-
gym = await client.load_gym(id="OSWorld-Ubuntu")
|
|
94
|
-
evalset = await client.load_evalset(id="OSWorld-Ubuntu")
|
|
95
|
-
|
|
96
|
-
# Create a run and environment
|
|
97
|
-
run = await client.create_run(name="example-run", gym=gym, evalset=evalset)
|
|
98
|
-
env = await run.make(metadata={"agent_id": "OSWORLD-1"})
|
|
99
|
-
await env.wait_for_ready()
|
|
100
|
-
|
|
101
|
-
###
|
|
102
|
-
### Agent loop goes here, see example in /examples
|
|
103
|
-
###
|
|
104
|
-
|
|
105
|
-
# Evaluate the environment
|
|
106
|
-
result = await env.evaluate()
|
|
107
|
-
|
|
108
|
-
# Close the environment when done
|
|
109
|
-
await env.close()
|
|
110
|
-
|
|
111
|
-
# Get analytics for the run such as rewards, task completions, etc.
|
|
112
|
-
analytics = await run.get_analytics()
|
|
113
|
-
print(analytics)
|
|
114
|
-
|
|
115
|
-
if __name__ == "__main__":
|
|
116
|
-
asyncio.run(main())
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
## Documentation
|
|
120
|
-
|
|
121
|
-
For comprehensive guides, examples, and API reference, visit [our docs](https://docs.hud.so/introduction)
|
|
122
|
-
|
|
123
|
-
## License
|
|
124
|
-
|
|
125
|
-
[MIT License](LICENSE)
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
hud/__init__.py,sha256=ExaYEMWVJDjuGwkMB8SZ90HFxxevW6iIkghOiorkY4s,414
|
|
2
|
-
hud/client.py,sha256=7WHXTQhVK-T9Rj4ZooADE_c1pah5Bc1DJ9ZRqUyUnuQ,5724
|
|
3
|
-
hud/environment.py,sha256=9r8eK3OVqr-wpPGlhnrpuDt-z6FIp3S3oukTZ7swN3o,8899
|
|
4
|
-
hud/gym.py,sha256=aanBHtlsXrJwrFax9SbXWwk_By-X8wE3M9deS-E_s4c,463
|
|
5
|
-
hud/run.py,sha256=5ukjuRNLjj5fczaWxpR_5NebFbQpoy8w81eRYy309Vg,6401
|
|
6
|
-
hud/settings.py,sha256=1ScSac0ta03LkckkH2gi6SyKY2M7nr15vRGugo2C_xs,1015
|
|
7
|
-
hud/adapters/__init__.py,sha256=y3H7yMl7rC-rrXG2WvePdSojoNFSui02eYTH17Xd7OY,87
|
|
8
|
-
hud/adapters/claude/__init__.py,sha256=i7QEF-29FLb9qxp1eYtXs-adIk_tG54tL-9g6d3xodk,100
|
|
9
|
-
hud/adapters/claude/adapter.py,sha256=ekYZixANKfx-4lENlXGaomh6Ecw4SRKtLWD5quGNWdM,5782
|
|
10
|
-
hud/adapters/common/__init__.py,sha256=BjdZWJVs_AKtpFrt-tNsdQRjnz7D97DFEQirJ-r0mp8,118
|
|
11
|
-
hud/adapters/common/adapter.py,sha256=SCtOuRjW5Szzd45LXCaqDEaKr2lhA-nIqSEMJ9KLsKI,5799
|
|
12
|
-
hud/adapters/common/types.py,sha256=Kgj0ZhiWOU6V95qxrvf-mMCvodLV_6rGBHwP1FQdMBk,4620
|
|
13
|
-
hud/server/__init__.py,sha256=VPrhyyqg3inge9J7BjcmDBNJRuvkCA9ZDXS_R5Q8ZtY,129
|
|
14
|
-
hud/server/requests.py,sha256=XEJkks8jJKFnbPDH32rrI9ww9MAGELAC-8Gr09vMd2s,6102
|
|
15
|
-
hud/utils/__init__.py,sha256=0m8klSLnMLeIJT23ipBXfFACk4hNWPsA6ZNqZDpv6oY,99
|
|
16
|
-
hud/utils/config.py,sha256=dze0BGE4q14omjj9822kL9BeiIgWQvJyuU29A2wa1SE,193
|
|
17
|
-
hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
-
hud_python-0.1.4.dist-info/METADATA,sha256=NxmtHt0FdUmopz24Z_tsyeUnzLevLXotHaCocf_CMP4,4785
|
|
19
|
-
hud_python-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
20
|
-
hud_python-0.1.4.dist-info/licenses/LICENSE,sha256=IVdfcZ8xq5apYGJS5GzRLLbm9r03Aecxd03isi-3P9k,1075
|
|
21
|
-
hud_python-0.1.4.dist-info/RECORD,,
|
|
File without changes
|