hud-python 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +3 -2
- hud/adapters/__init__.py +2 -1
- hud/adapters/claude/adapter.py +15 -2
- hud/adapters/common/types.py +7 -3
- hud/adapters/operator/adapter.py +10 -6
- hud/agent/__init__.py +2 -1
- hud/agent/claude.py +22 -2
- hud/agent/langchain.py +198 -0
- hud/agent/operator.py +35 -17
- hud/env/docker_client.py +1 -1
- hud/env/environment.py +182 -9
- hud/env/local_docker_client.py +3 -1
- hud/env/remote_client.py +4 -0
- hud/gym.py +3 -3
- hud/job.py +420 -12
- hud/task.py +41 -30
- hud/taskset.py +8 -0
- hud/types.py +5 -3
- hud/utils/common.py +31 -1
- hud/utils/config.py +2 -93
- hud/utils/progress.py +136 -0
- {hud_python-0.2.0.dist-info → hud_python-0.2.2.dist-info}/METADATA +52 -39
- hud_python-0.2.2.dist-info/RECORD +46 -0
- hud_python-0.2.0.dist-info/RECORD +0 -44
- {hud_python-0.2.0.dist-info → hud_python-0.2.2.dist-info}/WHEEL +0 -0
- {hud_python-0.2.0.dist-info → hud_python-0.2.2.dist-info}/licenses/LICENSE +0 -0
hud/utils/config.py
CHANGED
|
@@ -2,14 +2,8 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import re
|
|
5
|
-
from typing import TYPE_CHECKING, Any
|
|
6
5
|
|
|
7
|
-
from
|
|
8
|
-
|
|
9
|
-
if TYPE_CHECKING:
|
|
10
|
-
from collections.abc import Iterator
|
|
11
|
-
|
|
12
|
-
from hud.task import Task
|
|
6
|
+
from hud.utils.common import HudStyleConfig, HudStyleConfigs
|
|
13
7
|
|
|
14
8
|
logger = logging.getLogger("hud.utils.config")
|
|
15
9
|
|
|
@@ -17,30 +11,6 @@ REMOTE_FUNCTION_PREFIX = "private_"
|
|
|
17
11
|
REMOTE_SETUP = "setup"
|
|
18
12
|
REMOTE_EVALUATE = "evaluate"
|
|
19
13
|
|
|
20
|
-
class HudStyleConfig(BaseModel):
|
|
21
|
-
function: str # Format: "x.y.z"
|
|
22
|
-
args: list[Any] # Must be json serializable
|
|
23
|
-
|
|
24
|
-
id: str | None = None # Optional id for remote execution
|
|
25
|
-
|
|
26
|
-
def __len__(self) -> int:
|
|
27
|
-
return len(self.args)
|
|
28
|
-
|
|
29
|
-
def __getitem__(self, index: int) -> Any:
|
|
30
|
-
return self.args[index]
|
|
31
|
-
|
|
32
|
-
def __iter__(self) -> Iterator[Any]:
|
|
33
|
-
return iter(self.args)
|
|
34
|
-
|
|
35
|
-
def __str__(self) -> str:
|
|
36
|
-
return f"{self.function}: {', '.join(str(arg) for arg in self.args)}"
|
|
37
|
-
|
|
38
|
-
# Type alias for the shorthand config, which just converts to function name and args
|
|
39
|
-
ShorthandConfig = tuple[str | dict[str, Any] | list[str] | list[dict[str, Any]], ...]
|
|
40
|
-
|
|
41
|
-
# Type alias for multiple config formats
|
|
42
|
-
HudStyleConfigs = ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | dict[str, Any] | str
|
|
43
|
-
|
|
44
14
|
def _is_valid_python_name(name: str) -> bool:
|
|
45
15
|
"""Check if a string is a valid Python identifier."""
|
|
46
16
|
return bool(re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", name))
|
|
@@ -95,7 +65,7 @@ def expand_config(config: HudStyleConfigs) -> list[HudStyleConfig]:
|
|
|
95
65
|
|
|
96
66
|
# If it's a list of HudStyleConfigs, return as is
|
|
97
67
|
if isinstance(config, list) and all(isinstance(item, HudStyleConfig) for item in config):
|
|
98
|
-
return config
|
|
68
|
+
return config # type: ignore
|
|
99
69
|
|
|
100
70
|
# Handle dictionary configuration
|
|
101
71
|
if isinstance(config, dict):
|
|
@@ -122,64 +92,3 @@ def expand_config(config: HudStyleConfigs) -> list[HudStyleConfig]:
|
|
|
122
92
|
error_msg = f"Unknown configuration type: {type(config)}"
|
|
123
93
|
logger.error(error_msg)
|
|
124
94
|
raise ValueError(error_msg)
|
|
125
|
-
|
|
126
|
-
def create_remote_config(
|
|
127
|
-
task: Task | None = None,
|
|
128
|
-
config: HudStyleConfigs | None = None,
|
|
129
|
-
function: str | None = None,
|
|
130
|
-
) -> list[HudStyleConfig]:
|
|
131
|
-
"""
|
|
132
|
-
Create a configuration based on provided inputs.
|
|
133
|
-
|
|
134
|
-
Args:
|
|
135
|
-
task: Task object with configuration
|
|
136
|
-
config: Direct configuration (expanded or not)
|
|
137
|
-
function: Function name to use
|
|
138
|
-
|
|
139
|
-
Returns:
|
|
140
|
-
list[HudStyleConfig]: List of standardized configurations
|
|
141
|
-
|
|
142
|
-
Logic:
|
|
143
|
-
1) If explicit config: expand and return HudStyleConfig with func of the function,
|
|
144
|
-
and args of expanded config
|
|
145
|
-
2) If task has the specified function defined: use that
|
|
146
|
-
3) If no task function: check for task._config and use that
|
|
147
|
-
4) If no _config: use task.id and create private_[function]
|
|
148
|
-
"""
|
|
149
|
-
# If no function provided, just expand the config and return it directly
|
|
150
|
-
if function is None:
|
|
151
|
-
if config:
|
|
152
|
-
return expand_config(config)
|
|
153
|
-
raise ValueError("Either function or config must be provided")
|
|
154
|
-
|
|
155
|
-
# Case 1: Explicit config provided
|
|
156
|
-
if config:
|
|
157
|
-
expanded_configs = expand_config(config)
|
|
158
|
-
return [HudStyleConfig(function=function, args=expanded_configs)]
|
|
159
|
-
|
|
160
|
-
# Must have a task for the remaining cases
|
|
161
|
-
if task is None:
|
|
162
|
-
raise ValueError("Either task or config must be provided")
|
|
163
|
-
|
|
164
|
-
# Case 2: Task has the specified function attribute
|
|
165
|
-
task_config = getattr(task, function, None)
|
|
166
|
-
if task_config and len(task_config) > 0:
|
|
167
|
-
expanded_configs = expand_config(task_config)
|
|
168
|
-
if task.id:
|
|
169
|
-
expanded_configs[0].id = task.id # for remote IDs
|
|
170
|
-
return [HudStyleConfig(function=function, args=expanded_configs)]
|
|
171
|
-
|
|
172
|
-
# Case 3: Check for _config
|
|
173
|
-
if hasattr(task, "config") and task.config:
|
|
174
|
-
if task.id:
|
|
175
|
-
task.config["id"] = task.id # for remote IDs
|
|
176
|
-
return [HudStyleConfig(function=function, args=[task.config])]
|
|
177
|
-
|
|
178
|
-
# Case 4: Use task.id
|
|
179
|
-
if task.id:
|
|
180
|
-
return [HudStyleConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=[task.id])]
|
|
181
|
-
|
|
182
|
-
# No valid configuration found
|
|
183
|
-
#logger.warning("No valid configuration found for function: %s", function)
|
|
184
|
-
return [HudStyleConfig(function=function, args=[])]
|
|
185
|
-
|
hud/utils/progress.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class StepProgressTracker:
|
|
8
|
+
"""
|
|
9
|
+
Tracks progress across potentially parallel async tasks based on steps completed.
|
|
10
|
+
Provides estimates assuming tasks run up to max_steps_per_task.
|
|
11
|
+
"""
|
|
12
|
+
def __init__(self, total_tasks: int, max_steps_per_task: int) -> None:
|
|
13
|
+
if total_tasks <= 0:
|
|
14
|
+
raise ValueError("total_tasks must be positive")
|
|
15
|
+
if max_steps_per_task <= 0:
|
|
16
|
+
raise ValueError("max_steps_per_task must be positive")
|
|
17
|
+
|
|
18
|
+
self.total_tasks = total_tasks
|
|
19
|
+
self.max_steps_per_task = max_steps_per_task
|
|
20
|
+
self.total_potential_steps = total_tasks * max_steps_per_task
|
|
21
|
+
|
|
22
|
+
# Use asyncio.Lock for potentially concurrent updates/reads if needed,
|
|
23
|
+
# but start without for simplicity in single-threaded asyncio.
|
|
24
|
+
# self._lock = asyncio.Lock()
|
|
25
|
+
self._task_steps: dict[str, int] = defaultdict(int)
|
|
26
|
+
self._finished_tasks: dict[str, bool] = defaultdict(bool)
|
|
27
|
+
self._tasks_started = 0
|
|
28
|
+
self._tasks_finished = 0
|
|
29
|
+
|
|
30
|
+
self.start_time: float | None = None
|
|
31
|
+
self.current_total_steps = 0
|
|
32
|
+
|
|
33
|
+
def start_task(self, task_id: str) -> None:
|
|
34
|
+
# async with self._lock: # If using lock
|
|
35
|
+
if self.start_time is None:
|
|
36
|
+
self.start_time = time.monotonic()
|
|
37
|
+
self._task_steps[task_id] = 0
|
|
38
|
+
self._finished_tasks[task_id] = False
|
|
39
|
+
self._tasks_started += 1
|
|
40
|
+
|
|
41
|
+
def increment_step(self, task_id: str) -> None:
|
|
42
|
+
# async with self._lock:
|
|
43
|
+
if (not self._finished_tasks[task_id] and
|
|
44
|
+
self._task_steps[task_id] < self.max_steps_per_task):
|
|
45
|
+
self._task_steps[task_id] += 1
|
|
46
|
+
# Update overall progress immediately
|
|
47
|
+
self._update_total_steps()
|
|
48
|
+
|
|
49
|
+
def finish_task(self, task_id: str) -> None:
|
|
50
|
+
# async with self._lock:
|
|
51
|
+
if not self._finished_tasks[task_id]:
|
|
52
|
+
# For calculation, consider a finished task as having completed max steps
|
|
53
|
+
self._task_steps[task_id] = self.max_steps_per_task
|
|
54
|
+
self._finished_tasks[task_id] = True
|
|
55
|
+
self._tasks_finished += 1
|
|
56
|
+
# Update overall progress
|
|
57
|
+
self._update_total_steps()
|
|
58
|
+
|
|
59
|
+
def _update_total_steps(self) -> None:
|
|
60
|
+
# This could be expensive if called extremely frequently.
|
|
61
|
+
# Called after increment or finish.
|
|
62
|
+
# async with self._lock:
|
|
63
|
+
self.current_total_steps = sum(self._task_steps.values())
|
|
64
|
+
|
|
65
|
+
def get_progress(self) -> tuple[int, int, float]:
|
|
66
|
+
"""Returns (current_steps, total_potential_steps, percentage)."""
|
|
67
|
+
# async with self._lock:
|
|
68
|
+
# Recalculate here for safety, though _update_total_steps should keep it current
|
|
69
|
+
# current_steps = sum(self._task_steps.values())
|
|
70
|
+
current_steps = self.current_total_steps
|
|
71
|
+
|
|
72
|
+
percentage = 0.0
|
|
73
|
+
if self.total_potential_steps > 0:
|
|
74
|
+
percentage = (current_steps / self.total_potential_steps) * 100
|
|
75
|
+
return current_steps, self.total_potential_steps, percentage
|
|
76
|
+
|
|
77
|
+
def get_stats(self) -> tuple[float, float | None]:
|
|
78
|
+
"""Returns (rate_steps_per_minute, eta_seconds_upper_bound)."""
|
|
79
|
+
# async with self._lock:
|
|
80
|
+
if self.start_time is None or self._tasks_started == 0:
|
|
81
|
+
return 0.0, None # No rate or ETA yet
|
|
82
|
+
|
|
83
|
+
elapsed_time = time.monotonic() - self.start_time
|
|
84
|
+
current_steps = self.current_total_steps
|
|
85
|
+
|
|
86
|
+
rate_sec = 0.0
|
|
87
|
+
if elapsed_time > 0:
|
|
88
|
+
rate_sec = current_steps / elapsed_time
|
|
89
|
+
|
|
90
|
+
rate_min = rate_sec * 60 # Convert rate to steps per minute
|
|
91
|
+
|
|
92
|
+
eta = None
|
|
93
|
+
# ETA calculation still uses rate_sec (steps/second) for time estimation in seconds
|
|
94
|
+
if rate_sec > 0:
|
|
95
|
+
remaining_steps = self.total_potential_steps - current_steps
|
|
96
|
+
eta = remaining_steps / rate_sec if remaining_steps > 0 else 0.0
|
|
97
|
+
|
|
98
|
+
return rate_min, eta # Return rate in steps/min
|
|
99
|
+
|
|
100
|
+
def is_finished(self) -> bool:
|
|
101
|
+
# async with self._lock:
|
|
102
|
+
return self._tasks_finished >= self.total_tasks
|
|
103
|
+
|
|
104
|
+
def display(self, bar_length: int = 40) -> str:
|
|
105
|
+
"""Generates a progress string similar to tqdm."""
|
|
106
|
+
current_steps, total_steps, percentage = self.get_progress()
|
|
107
|
+
rate_min, eta = self.get_stats() # Rate is now per minute
|
|
108
|
+
|
|
109
|
+
# Ensure valid values for display
|
|
110
|
+
current_steps = min(current_steps, total_steps)
|
|
111
|
+
percentage = max(0.0, min(100.0, percentage))
|
|
112
|
+
|
|
113
|
+
filled_length = int(bar_length * current_steps // total_steps) if total_steps else 0
|
|
114
|
+
bar = "█" * filled_length + "-" * (bar_length - filled_length)
|
|
115
|
+
|
|
116
|
+
# Format time
|
|
117
|
+
elapsed_str = "0:00"
|
|
118
|
+
eta_str = "??:??"
|
|
119
|
+
if self.start_time:
|
|
120
|
+
elapsed_seconds = int(time.monotonic() - self.start_time)
|
|
121
|
+
elapsed_str = f"{elapsed_seconds // 60}:{elapsed_seconds % 60:02d}"
|
|
122
|
+
if eta is not None:
|
|
123
|
+
eta_seconds = int(eta)
|
|
124
|
+
eta_str = f"{eta_seconds // 60}:{eta_seconds % 60:02d}"
|
|
125
|
+
elif self.is_finished():
|
|
126
|
+
eta_str = "0:00"
|
|
127
|
+
|
|
128
|
+
# Update rate string format
|
|
129
|
+
rate_str = f"{rate_min:.1f} steps/min" if rate_min > 0 else "?? steps/min"
|
|
130
|
+
|
|
131
|
+
# Format steps - use K/M for large numbers if desired, keep simple for now
|
|
132
|
+
steps_str = f"{current_steps}/{total_steps}"
|
|
133
|
+
|
|
134
|
+
# tasks_str = f" {self._tasks_finished}/{self.total_tasks} tasks" # Optional tasks counter
|
|
135
|
+
|
|
136
|
+
return f"{percentage:3.0f}%|{bar}| {steps_str} [{elapsed_str}<{eta_str}, {rate_str}]"
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: SDK for the HUD evaluation platform.
|
|
5
|
-
Project-URL: Homepage, https://github.com/
|
|
6
|
-
Project-URL: Bug Tracker, https://github.com/
|
|
5
|
+
Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
|
|
7
7
|
Project-URL: Documentation, https://hud.so
|
|
8
|
-
Author-email:
|
|
8
|
+
Author-email: HUD SDK <founders@hud.so>
|
|
9
9
|
License: MIT License
|
|
10
10
|
|
|
11
11
|
Copyright (c) 2025 Human Union Data, Inc
|
|
@@ -37,8 +37,14 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
37
37
|
Classifier: Programming Language :: Python :: 3.13
|
|
38
38
|
Requires-Python: <3.14,>=3.10
|
|
39
39
|
Requires-Dist: aiodocker>=0.24.0
|
|
40
|
+
Requires-Dist: anthropic
|
|
40
41
|
Requires-Dist: httpx<1,>=0.23.0
|
|
41
42
|
Requires-Dist: inspect-ai>=0.3.80
|
|
43
|
+
Requires-Dist: ipykernel
|
|
44
|
+
Requires-Dist: langchain
|
|
45
|
+
Requires-Dist: langchain-openai
|
|
46
|
+
Requires-Dist: numpy
|
|
47
|
+
Requires-Dist: openai
|
|
42
48
|
Requires-Dist: pillow>=11.1.0
|
|
43
49
|
Requires-Dist: pydantic-settings<3,>=2
|
|
44
50
|
Requires-Dist: pydantic<3,>=2
|
|
@@ -57,7 +63,7 @@ Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
|
|
|
57
63
|
Requires-Dist: ruff==0.9.8; extra == 'dev'
|
|
58
64
|
Description-Content-Type: text/markdown
|
|
59
65
|
|
|
60
|
-
# HUD
|
|
66
|
+
# HUD
|
|
61
67
|
|
|
62
68
|
A Python SDK for creating, evaluating, and benchmarking agent interactions with web browsers and OS environments.
|
|
63
69
|
|
|
@@ -86,21 +92,20 @@ export HUD_API_KEY=your_api_key_here
|
|
|
86
92
|
pip install hud-python
|
|
87
93
|
```
|
|
88
94
|
|
|
89
|
-
### Simple Browser Example with
|
|
95
|
+
### Simple Browser Example with Claude Computer Use
|
|
90
96
|
|
|
91
97
|
> This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
|
|
92
98
|
|
|
99
|
+
Make sure you have defined `ANTHROPIC_API_KEY` in your environment variables to run Claude.
|
|
100
|
+
|
|
93
101
|
```python
|
|
94
|
-
import os
|
|
95
102
|
import asyncio
|
|
96
103
|
from hud import gym, job
|
|
97
104
|
from hud.task import Task
|
|
98
|
-
from hud.
|
|
99
|
-
from hud.agent import OperatorAgent
|
|
105
|
+
from hud.agent import ClaudeAgent
|
|
100
106
|
|
|
101
107
|
@job("test-run")
|
|
102
108
|
async def main():
|
|
103
|
-
# Define a simple task
|
|
104
109
|
task = Task(
|
|
105
110
|
prompt="Insert the text 'capybara' into the search bar",
|
|
106
111
|
gym="hud-browser",
|
|
@@ -108,26 +113,19 @@ async def main():
|
|
|
108
113
|
evaluate=("contains_text", "capybara")
|
|
109
114
|
)
|
|
110
115
|
|
|
111
|
-
# Create environment
|
|
116
|
+
# Create environment using the gym module
|
|
112
117
|
env = await gym.make(task)
|
|
113
118
|
|
|
114
|
-
# Get URLs and display live view (optional)
|
|
115
|
-
# urls = await env.get_urls()
|
|
116
|
-
# stream(urls["live_url"])
|
|
117
|
-
|
|
118
119
|
# Initialize Claude agent (API key is loaded automatically)
|
|
119
|
-
agent =
|
|
120
|
+
agent = ClaudeAgent()
|
|
120
121
|
|
|
121
|
-
# Agent loop
|
|
122
|
-
obs, _ = env.reset()
|
|
122
|
+
# Agent loop with predict and step functions
|
|
123
|
+
obs, _ = await env.reset() # Gets first observation
|
|
123
124
|
for i in range(5):
|
|
124
125
|
actions, done = await agent.predict(obs)
|
|
125
|
-
|
|
126
|
-
break
|
|
127
|
-
|
|
126
|
+
|
|
128
127
|
obs, reward, terminated, info = await env.step(actions)
|
|
129
|
-
if terminated:
|
|
130
|
-
break
|
|
128
|
+
if done or terminated: break
|
|
131
129
|
|
|
132
130
|
# Evaluate and close
|
|
133
131
|
result = await env.evaluate()
|
|
@@ -139,35 +137,50 @@ if __name__ == "__main__":
|
|
|
139
137
|
|
|
140
138
|
```
|
|
141
139
|
|
|
140
|
+
Alternatively, run a full evaluation set via the `run_job` function:
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from hud import load_taskset, run_job, ClaudeAgent
|
|
144
|
+
|
|
145
|
+
# load
|
|
146
|
+
taskset = load_taskset("GAIA")
|
|
147
|
+
|
|
148
|
+
# evaluate
|
|
149
|
+
job = await run_job(ClaudeAgent, taskset, "test-gaia-job")
|
|
150
|
+
|
|
151
|
+
# get results OR view them in app.hud.so
|
|
152
|
+
print(await job.get_analytics())
|
|
153
|
+
```
|
|
154
|
+
|
|
142
155
|
## Documentation Sections
|
|
143
156
|
|
|
144
157
|
Explore the core concepts and features of the SDK:
|
|
145
158
|
|
|
146
|
-
* **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios.
|
|
147
|
-
* **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
|
|
148
|
-
* **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
|
|
149
|
-
* **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
|
|
150
|
-
* **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
|
|
151
|
-
* **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
|
|
159
|
+
* **[Tasks and TaskSets](https://documentation.hud.so/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
|
|
160
|
+
* **[Environments](https://documentation.hud.so/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
|
|
161
|
+
* **[Agents](https://documentation.hud.so/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
|
|
162
|
+
* **[Adapters](https://documentation.hud.so/concepts/adapter)**: See how actions and observations are translated between agents and environments.
|
|
163
|
+
* **[Jobs](https://documentation.hud.so/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
|
|
164
|
+
* **[Trajectories](https://documentation.hud.so/concepts/trajectory)**: Understand the recorded data from each agent run.
|
|
152
165
|
* **Advanced Topics**:
|
|
153
|
-
* **[
|
|
154
|
-
* **[
|
|
155
|
-
* **[
|
|
166
|
+
* **[CLA Action Details](https://documentation.hud.so/advanced/cla-details)**: Explore the standardized action format.
|
|
167
|
+
* **[Custom Environments](https://documentation.hud.so/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
|
|
168
|
+
* **[Advanced Environment Control](https://documentation.hud.so/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
|
|
156
169
|
|
|
157
|
-
* **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
|
|
170
|
+
* **[Full API Reference](https://documentation.hud.so/api-reference/gym)**: Detailed specifications for all modules and classes.
|
|
158
171
|
|
|
159
172
|
## [Examples](examples/)
|
|
160
173
|
|
|
161
|
-
We
|
|
174
|
+
We recommend you first take a look at the example notebooks showing how to use the HUD SDK:
|
|
162
175
|
|
|
163
176
|
1. [Browser Basics](examples/browser_use.ipynb) - Simple browser interaction with live view
|
|
164
177
|
2. [Task Design](examples/tasks.ipynb) - Creating and customizing tasks
|
|
165
|
-
3. [OSWorld](examples/osworld.ipynb) -
|
|
178
|
+
3. [OSWorld](examples/osworld.ipynb) - Running the OSWorld benchmark
|
|
166
179
|
4. [Local Development](examples/local.ipynb) - Setting up local custom environments
|
|
167
180
|
|
|
168
181
|
## Documentation
|
|
169
182
|
|
|
170
|
-
For comprehensive guides, examples, and API reference, visit [our docs](https://
|
|
183
|
+
For comprehensive guides, examples, and API reference, visit [our docs](https://documentation.hud.so/introduction)
|
|
171
184
|
|
|
172
185
|
## License
|
|
173
186
|
|
|
@@ -180,9 +193,9 @@ If you use this SDK in your research, please cite it as follows:
|
|
|
180
193
|
```bibtex
|
|
181
194
|
@software{hud2025agentevalplatform,
|
|
182
195
|
author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
|
|
183
|
-
title = {{HUD: An Evaluation Platform for
|
|
184
|
-
date = {2025-
|
|
185
|
-
url = {https://github.com/
|
|
196
|
+
title = {{HUD: An Evaluation Platform for Agents}},
|
|
197
|
+
date = {2025-04},
|
|
198
|
+
url = {https://github.com/hud-evals/hud-sdk},
|
|
186
199
|
langid = {en}
|
|
187
200
|
}
|
|
188
201
|
```
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
hud/__init__.py,sha256=XJXuALIb-pRnnVdfEkjpuiLtS77WD3Idv5VOLECY3eo,488
|
|
2
|
+
hud/gym.py,sha256=ErNwJgCJVhWZHzMILfzVXX0Dawh5Cy0nIQWWh7fsKW4,3641
|
|
3
|
+
hud/job.py,sha256=IvW2sBFoQpExXVi2FL3cEwnrxVIGp8RBfVj2s8edn20,22387
|
|
4
|
+
hud/settings.py,sha256=rv8TiZx4wmBzIoEEkOzoywC0nt8UZXlHxIa_LW4tWAg,1346
|
|
5
|
+
hud/task.py,sha256=kuP69hIxV0ZsHRsZ1XEq6lzYnUSD3b6ywWzloCGW5DU,5380
|
|
6
|
+
hud/taskset.py,sha256=xDPBXeDm4AlSOwl-MM98lN0x6PmGV8t9jv7sNyS_u0c,2426
|
|
7
|
+
hud/trajectory.py,sha256=PA-sE2iyt2BctO2Dex-2ZaRmS95AkEXTicZjHCVCYqE,3749
|
|
8
|
+
hud/types.py,sha256=D_OGPutR55PlWrUDqehYLlR-FqQp9GyKlxJhNmCRyFE,2485
|
|
9
|
+
hud/adapters/__init__.py,sha256=zz24KdC_e9TJPgWo6y57_8SzevEE5ak4Cm6tXzMxwRk,266
|
|
10
|
+
hud/adapters/claude/__init__.py,sha256=i7QEF-29FLb9qxp1eYtXs-adIk_tG54tL-9g6d3xodk,100
|
|
11
|
+
hud/adapters/claude/adapter.py,sha256=viZDCNjM6aCCfpxt3PIxfVOz3rrlOgZli5WyHUxEGjc,6079
|
|
12
|
+
hud/adapters/common/__init__.py,sha256=BjdZWJVs_AKtpFrt-tNsdQRjnz7D97DFEQirJ-r0mp8,118
|
|
13
|
+
hud/adapters/common/adapter.py,sha256=ls-gXtg1N_SQc211rkDb3LL511HNZv6etm1nx2ZtrkQ,5808
|
|
14
|
+
hud/adapters/common/types.py,sha256=9RWLZp6sViu9uPSU5K8-TRaQkdirunxZfDsPIxAR_TM,4995
|
|
15
|
+
hud/adapters/operator/__init__.py,sha256=31vTRs268_TOLd-TeQRKau5bDYy78wxCNpJFhD5_l8U,104
|
|
16
|
+
hud/adapters/operator/adapter.py,sha256=NNbNYPqSquIh4KHCk9aN7dARe7yPUx0J2kDIk-N015s,3309
|
|
17
|
+
hud/agent/__init__.py,sha256=qdCWY6wthkTpyq7SWT1JYAYu1eXk4LfdSAcAfKt0Ohs,294
|
|
18
|
+
hud/agent/base.py,sha256=RThJ_h4A3oU23zyvvKtxY2a_YM03Vd1XYDXdY3bAf8g,3881
|
|
19
|
+
hud/agent/claude.py,sha256=tbDKAzGCLJPnUnHc8eV-zZmj3ZG6QQx0ukWKoO4Ekec,7445
|
|
20
|
+
hud/agent/langchain.py,sha256=9ow74ENcJmZ_muzoMdG2tz5VhvAHm2zKiemphHZm-Pg,8683
|
|
21
|
+
hud/agent/operator.py,sha256=44t19TzcCrS1N3-rnD25ZLXx5s4Io8On27LomALuugs,8185
|
|
22
|
+
hud/env/__init__.py,sha256=BHFY_N0kEI142pjWtMyqUb3BGnoiekY8evRCIbSbO2w,271
|
|
23
|
+
hud/env/client.py,sha256=SPR6ct6NFxmIrgIi3K8tEC-vnqOmCbCBtuT81PaVjuY,869
|
|
24
|
+
hud/env/docker_client.py,sha256=56_u3Ri4NulGcBumAg-7-KilmFmBKthOwEIM5bOLOZc,10418
|
|
25
|
+
hud/env/environment.py,sha256=Xyq4KQO9aWYPwZ0uESAetB5EEZgmlEnZVc7sA0DLz2c,13706
|
|
26
|
+
hud/env/local_docker_client.py,sha256=TCD9z1qjafxjwAWLatAL8d587_ioMDHjs8T5cBgusr8,7789
|
|
27
|
+
hud/env/remote_client.py,sha256=XDKmr5ImLBMZn-ToPrXnc4iBNRwDwzPtQIXEcgShbhE,5977
|
|
28
|
+
hud/env/remote_docker_client.py,sha256=FwaO7NyygDt9oe3pDD7PwUS21pxzc465mwcXk-Cx-60,6838
|
|
29
|
+
hud/evaluators/__init__.py,sha256=XophB666xPnurhQ_ygfW44h0Jh0BQGCgUzCXEOG2Q1g,158
|
|
30
|
+
hud/evaluators/base.py,sha256=CNbrvFWQfl1YuBxJKzuG4_TBAdAf0TOQA3hl7eGsbaA,782
|
|
31
|
+
hud/evaluators/inspect.py,sha256=eTu9E2eBFe4jd4sPtXL0_vGIEY8aiEmTgmN__v77jvI,735
|
|
32
|
+
hud/evaluators/judge.py,sha256=0T9DHFRR38oH2X1d87t58SBSAhbKWRH5PlljzCa3rkg,6449
|
|
33
|
+
hud/evaluators/match.py,sha256=iFJ_qqaHJQ19TICdQPjHgTubBhq7izCIET5qU_104Fk,4719
|
|
34
|
+
hud/evaluators/remote.py,sha256=NVUJJvrpGQj2eL-aFxzTEnAWW7iuSI9eDWtar54dc6E,2174
|
|
35
|
+
hud/server/__init__.py,sha256=cxDKTwMdGzhj7bYajtejN8XCt7K8Xq3eKB2No0qBpoY,169
|
|
36
|
+
hud/server/requests.py,sha256=s8LZZYWT1wl7lPu2vwRaYPZs9_gjKwSg3LZLvS5-s6E,9085
|
|
37
|
+
hud/utils/__init__.py,sha256=LnoI2tQUnd-mQ4eg-gpJJgmHBBIhggJ6c9ap7MBgrfs,260
|
|
38
|
+
hud/utils/common.py,sha256=xJWBF2KTAQKYMGFq5hJWcwpcHAEYY3so4ZqvZYf1BjU,2778
|
|
39
|
+
hud/utils/config.py,sha256=Evu2nUCYaujpWXXwLprsgr_KFUkWuSdkibmLRJ_iq64,3281
|
|
40
|
+
hud/utils/progress.py,sha256=gP7_NXG0m_bhNaYPwrwUOeNumwjx4ewjXP7v-_0Lsj0,5684
|
|
41
|
+
hud/utils/telemetry.py,sha256=md7AuKxtDqsONMeeTOHen1XpmNds8CbXROX_PnkDxFc,1993
|
|
42
|
+
hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
43
|
+
hud_python-0.2.2.dist-info/METADATA,sha256=I26pZPqv8O5r36BNehTmJuYQjtbRu-C3bCjt37Iwync,7963
|
|
44
|
+
hud_python-0.2.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
45
|
+
hud_python-0.2.2.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
|
|
46
|
+
hud_python-0.2.2.dist-info/RECORD,,
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
hud/__init__.py,sha256=YX9zAqOSjAFZqHbDJGUVefOsxg7PhkH1ZDflRoiSgP8,464
|
|
2
|
-
hud/gym.py,sha256=cKjIuJS7A0vJx4K7fctpUjIEv8TkW5x6aB_PRrODrDY,3651
|
|
3
|
-
hud/job.py,sha256=E4RN1CkppRQVy46RWCUDjNIyhMa7lNlFfCgpky2vKFk,5463
|
|
4
|
-
hud/settings.py,sha256=rv8TiZx4wmBzIoEEkOzoywC0nt8UZXlHxIa_LW4tWAg,1346
|
|
5
|
-
hud/task.py,sha256=q1E_urMavnfsb87x2JHkRNMBzbkkaQI1skOulkpJ5DY,5132
|
|
6
|
-
hud/taskset.py,sha256=fV4QgHf8tphDoMjTdBzkyCJT7pQBLEMoGu_Uxuji2DM,2226
|
|
7
|
-
hud/trajectory.py,sha256=PA-sE2iyt2BctO2Dex-2ZaRmS95AkEXTicZjHCVCYqE,3749
|
|
8
|
-
hud/types.py,sha256=fJZnzK3j3mq7G0gO5TbqRaN92qT4xAb4jUNOXIX8ZZ0,2395
|
|
9
|
-
hud/adapters/__init__.py,sha256=0RNQgrzBCkhNBq1Q7JRESN1WfUVLs_99fR5g1re3APs,207
|
|
10
|
-
hud/adapters/claude/__init__.py,sha256=i7QEF-29FLb9qxp1eYtXs-adIk_tG54tL-9g6d3xodk,100
|
|
11
|
-
hud/adapters/claude/adapter.py,sha256=sgdgkCtNFjFPSSmfsUD1vx0Xz9xhG81A_it4BvRsOXE,5781
|
|
12
|
-
hud/adapters/common/__init__.py,sha256=BjdZWJVs_AKtpFrt-tNsdQRjnz7D97DFEQirJ-r0mp8,118
|
|
13
|
-
hud/adapters/common/adapter.py,sha256=ls-gXtg1N_SQc211rkDb3LL511HNZv6etm1nx2ZtrkQ,5808
|
|
14
|
-
hud/adapters/common/types.py,sha256=ubnWlm4JMtCkTNonKZGb425p6oi8jZyIVcekp-pjTXQ,4905
|
|
15
|
-
hud/adapters/operator/__init__.py,sha256=31vTRs268_TOLd-TeQRKau5bDYy78wxCNpJFhD5_l8U,104
|
|
16
|
-
hud/adapters/operator/adapter.py,sha256=j2bBe_bwOhdbd7Qr6UvWUEkTkUTOA-ADvWYx0B1c_TU,3159
|
|
17
|
-
hud/agent/__init__.py,sha256=cI3bqfmG2_Lwzn2RjrxV0X9qIxCRDiffwd1UaWToct4,238
|
|
18
|
-
hud/agent/base.py,sha256=RThJ_h4A3oU23zyvvKtxY2a_YM03Vd1XYDXdY3bAf8g,3881
|
|
19
|
-
hud/agent/claude.py,sha256=ZPoged_sun2CmPgludfkV4uv-gjak_yyIlGgCIRcWx0,6583
|
|
20
|
-
hud/agent/operator.py,sha256=zJaYW5kJ7rgvRQCufrjsoNCPn2Ra9EakmFFwut_v7Hk,7335
|
|
21
|
-
hud/env/__init__.py,sha256=BHFY_N0kEI142pjWtMyqUb3BGnoiekY8evRCIbSbO2w,271
|
|
22
|
-
hud/env/client.py,sha256=SPR6ct6NFxmIrgIi3K8tEC-vnqOmCbCBtuT81PaVjuY,869
|
|
23
|
-
hud/env/docker_client.py,sha256=4G3OeFBCbIqg9zOXxreDekNvLNMhgtc2cMAjMbqB6Tk,10394
|
|
24
|
-
hud/env/environment.py,sha256=h-Z7I_1Y8vXBL1oOYbC5xRIKwl28NZt0PJ4GmKcd0AM,5863
|
|
25
|
-
hud/env/local_docker_client.py,sha256=9p2IHeSRmk9_lU7FRiHaCMWn0CjbtWLQjsT3x8x6qxY,7767
|
|
26
|
-
hud/env/remote_client.py,sha256=iJiwueuf98xOx0_Y2ltu_63BwKIKNvohhim73Goq74E,5804
|
|
27
|
-
hud/env/remote_docker_client.py,sha256=FwaO7NyygDt9oe3pDD7PwUS21pxzc465mwcXk-Cx-60,6838
|
|
28
|
-
hud/evaluators/__init__.py,sha256=XophB666xPnurhQ_ygfW44h0Jh0BQGCgUzCXEOG2Q1g,158
|
|
29
|
-
hud/evaluators/base.py,sha256=CNbrvFWQfl1YuBxJKzuG4_TBAdAf0TOQA3hl7eGsbaA,782
|
|
30
|
-
hud/evaluators/inspect.py,sha256=eTu9E2eBFe4jd4sPtXL0_vGIEY8aiEmTgmN__v77jvI,735
|
|
31
|
-
hud/evaluators/judge.py,sha256=0T9DHFRR38oH2X1d87t58SBSAhbKWRH5PlljzCa3rkg,6449
|
|
32
|
-
hud/evaluators/match.py,sha256=iFJ_qqaHJQ19TICdQPjHgTubBhq7izCIET5qU_104Fk,4719
|
|
33
|
-
hud/evaluators/remote.py,sha256=NVUJJvrpGQj2eL-aFxzTEnAWW7iuSI9eDWtar54dc6E,2174
|
|
34
|
-
hud/server/__init__.py,sha256=cxDKTwMdGzhj7bYajtejN8XCt7K8Xq3eKB2No0qBpoY,169
|
|
35
|
-
hud/server/requests.py,sha256=s8LZZYWT1wl7lPu2vwRaYPZs9_gjKwSg3LZLvS5-s6E,9085
|
|
36
|
-
hud/utils/__init__.py,sha256=LnoI2tQUnd-mQ4eg-gpJJgmHBBIhggJ6c9ap7MBgrfs,260
|
|
37
|
-
hud/utils/common.py,sha256=qTAgiqQqplfrCrll06SAYYr9TyT8gnV4mwDSxsj-W1s,1842
|
|
38
|
-
hud/utils/config.py,sha256=x3F9Rg2lTGEG8_FcnEyymh4Y02qD1UWmcDlOSA1Xq0U,6476
|
|
39
|
-
hud/utils/telemetry.py,sha256=md7AuKxtDqsONMeeTOHen1XpmNds8CbXROX_PnkDxFc,1993
|
|
40
|
-
hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
|
-
hud_python-0.2.0.dist-info/METADATA,sha256=GbG7OHnQ8WqR3iXT6utC26PkCmgPKrOePTdCNZxuwK4,7222
|
|
42
|
-
hud_python-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
43
|
-
hud_python-0.2.0.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
|
|
44
|
-
hud_python-0.2.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|