hud-python 0.1.0__tar.gz → 0.1.0b2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.1.0 → hud_python-0.1.0b2}/PKG-INFO +32 -20
- hud_python-0.1.0b2/README.md +80 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/agent/base.py +4 -1
- {hud_python-0.1.0 → hud_python-0.1.0b2}/agent/claude.py +13 -13
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/installation.mdx +1 -1
- hud_python-0.1.0b2/examples/README.md +44 -0
- hud_python-0.1.0b2/examples/claude_osworld.ipynb +154 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/__init__.py +3 -3
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/client.py +19 -4
- hud_python-0.1.0/hud/env.py → hud_python-0.1.0b2/hud/environment.py +41 -2
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/run.py +62 -9
- hud_python-0.1.0b2/hud/server/requests.py +166 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/pyproject.toml +2 -1
- {hud_python-0.1.0 → hud_python-0.1.0b2}/tests/test_import.py +1 -1
- hud_python-0.1.0/README.md +0 -69
- hud_python-0.1.0/examples/README.md +0 -22
- hud_python-0.1.0/examples/basic_usage.py +0 -81
- hud_python-0.1.0/examples/claude_agent_example.py +0 -134
- hud_python-0.1.0/examples/simple_agent_example.py +0 -162
- hud_python-0.1.0/hud/server/requests.py +0 -79
- {hud_python-0.1.0 → hud_python-0.1.0b2}/.env.example +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/.github/workflows/ci.yml +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/.github/workflows/release.yml +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/.gitignore +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/LICENSE +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/MANIFEST.in +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/agent/response_agent.py +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/api-reference/adapters.mdx +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/api-reference/client.mdx +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/api-reference/env.mdx +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/concepts/adapter.mdx +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/concepts/client.mdx +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/concepts/environment.mdx +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/concepts/gym.mdx +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/examples/basic.mdx +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/examples/claude-agent.mdx +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/examples/custom-agent.mdx +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/introduction.mdx +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/logo/HUD.svg +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/mint.json +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/docs/quickstart.mdx +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/adapters/__init__.py +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/adapters/claude/__init__.py +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/adapters/claude/adapter.py +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/adapters/common/__init__.py +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/adapters/common/adapter.py +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/adapters/common/types.py +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/gym.py +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/py.typed +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/server/__init__.py +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/settings.py +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/utils/__init__.py +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/hud/utils/config.py +0 -0
- {hud_python-0.1.0 → hud_python-0.1.0b2}/tests/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.0b2
|
|
4
4
|
Summary: SDK for the HUD evaluation platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/Human-Data/hud-sdk
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/Human-Data/hud-sdk/issues
|
|
@@ -44,6 +44,7 @@ Requires-Dist: pydantic-settings<3,>=2
|
|
|
44
44
|
Requires-Dist: pydantic<3,>=2
|
|
45
45
|
Provides-Extra: dev
|
|
46
46
|
Requires-Dist: anthropic; extra == 'dev'
|
|
47
|
+
Requires-Dist: dotenv; extra == 'dev'
|
|
47
48
|
Requires-Dist: ipykernel; extra == 'dev'
|
|
48
49
|
Requires-Dist: ipython<9; extra == 'dev'
|
|
49
50
|
Requires-Dist: jupyter-client; extra == 'dev'
|
|
@@ -54,38 +55,40 @@ Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
|
|
|
54
55
|
Requires-Dist: ruff==0.9.8; extra == 'dev'
|
|
55
56
|
Description-Content-Type: text/markdown
|
|
56
57
|
|
|
57
|
-
# HUD
|
|
58
|
+
# HUD
|
|
58
59
|
|
|
59
|
-
A Python SDK for interacting with HUD environments and evaluation benchmarks for browser use and computer use models.
|
|
60
|
+
A Python SDK for interacting with HUD environments and evaluation benchmarks for browser use and computer use models. Visit [hud.so](https://hud.so).
|
|
60
61
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
> **Alpha Release Notice**: This SDK is currently in alpha status (v0.1.0-alpha). The API is still evolving and may change in future releases as we gather feedback and improve functionality.
|
|
62
|
+
> **Alpha Release Notice**: This SDK is currently in alpha status (v0.1.0-alpha). The API is evolving and may change in future releases as we gather feedback and improve functionality.
|
|
64
63
|
|
|
65
64
|
[](https://pypi.org/project/hud-python/)
|
|
66
65
|
|
|
67
|
-
[📚 Documentation](https://
|
|
66
|
+
[📚 Documentation](https://documentation.hud.so) | [🏠 Homepage](https://hud.so)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
## Quick start
|
|
68
70
|
|
|
69
|
-
|
|
71
|
+
[RECOMMENDED] To get started with an agent, see the [Claude Computer use example](https://github.com/Human-Data/hud-sdk/tree/main/examples).
|
|
70
72
|
|
|
73
|
+
|
|
74
|
+
Otherwise, install the package with Python>=3.9:
|
|
71
75
|
```bash
|
|
72
|
-
# Install the latest stable release
|
|
73
76
|
pip install hud-python
|
|
77
|
+
```
|
|
74
78
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
# Install a specific alpha version
|
|
79
|
-
pip install hud-python==0.1.0-alpha
|
|
79
|
+
Make sure to set up your account [here](https://hud.so/settings) and add your API key to the environment variables:
|
|
80
|
+
```bash
|
|
81
|
+
HUD_API_KEY=<your-api-key>
|
|
80
82
|
```
|
|
81
83
|
|
|
84
|
+
Load in your agent and create a run! Go to the [examples](https://github.com/Human-Data/hud-sdk/tree/main/examples) folder for more examples.
|
|
82
85
|
```python
|
|
83
86
|
import asyncio
|
|
84
87
|
from hud import HUDClient
|
|
85
88
|
|
|
86
89
|
async def main():
|
|
87
90
|
# Initialize client with API key
|
|
88
|
-
client = HUDClient(api_key="
|
|
91
|
+
client = HUDClient(api_key=os.getenv("HUD_API_KEY"))
|
|
89
92
|
|
|
90
93
|
# Load a gym and evaluation set
|
|
91
94
|
gym = await client.load_gym(id="OSWorld-Ubuntu")
|
|
@@ -93,24 +96,33 @@ async def main():
|
|
|
93
96
|
|
|
94
97
|
# Create a run and environment
|
|
95
98
|
run = client.create_run(name="example-run", gym=gym, evalset=evalset)
|
|
96
|
-
env = await run.make(metadata={"agent_id": "
|
|
99
|
+
env = await run.make(metadata={"agent_id": "OSWORLD-1"})
|
|
100
|
+
await env.wait_for_ready()
|
|
101
|
+
|
|
102
|
+
###
|
|
103
|
+
### Agent loop goes here, see example in /examples
|
|
104
|
+
###
|
|
97
105
|
|
|
98
|
-
#
|
|
99
|
-
|
|
106
|
+
# Evaluate the environment
|
|
107
|
+
result = await env.evaluate()
|
|
100
108
|
|
|
101
109
|
# Close the environment when done
|
|
102
110
|
await env.close()
|
|
103
111
|
|
|
112
|
+
# Get analytics for the run such as rewards, task completions, etc.
|
|
113
|
+
analytics = await run.get_analytics()
|
|
114
|
+
print(analytics)
|
|
115
|
+
|
|
104
116
|
if __name__ == "__main__":
|
|
105
117
|
asyncio.run(main())
|
|
106
118
|
```
|
|
107
119
|
|
|
108
|
-
##
|
|
120
|
+
## Features
|
|
109
121
|
|
|
110
122
|
- Connect to HUD evaluation environments
|
|
111
123
|
- Run benchmarks across various tasks
|
|
112
124
|
- Support for different agent adapters
|
|
113
|
-
- Asynchronous API
|
|
125
|
+
- Asynchronous API
|
|
114
126
|
|
|
115
127
|
## Documentation
|
|
116
128
|
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# HUD
|
|
2
|
+
|
|
3
|
+
A Python SDK for interacting with HUD environments and evaluation benchmarks for browser use and computer use models. Visit [hud.so](https://hud.so).
|
|
4
|
+
|
|
5
|
+
> **Alpha Release Notice**: This SDK is currently in alpha status (v0.1.0-alpha). The API is evolving and may change in future releases as we gather feedback and improve functionality.
|
|
6
|
+
|
|
7
|
+
[](https://pypi.org/project/hud-python/)
|
|
8
|
+
|
|
9
|
+
[📚 Documentation](https://documentation.hud.so) | [🏠 Homepage](https://hud.so)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
## Quick start
|
|
13
|
+
|
|
14
|
+
[RECOMMENDED] To get started with an agent, see the [Claude Computer use example](https://github.com/Human-Data/hud-sdk/tree/main/examples).
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
Otherwise, install the package with Python>=3.9:
|
|
18
|
+
```bash
|
|
19
|
+
pip install hud-python
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Make sure to set up your account [here](https://hud.so/settings) and add your API key to the environment variables:
|
|
23
|
+
```bash
|
|
24
|
+
HUD_API_KEY=<your-api-key>
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Load in your agent and create a run! Go to the [examples](https://github.com/Human-Data/hud-sdk/tree/main/examples) folder for more examples.
|
|
28
|
+
```python
|
|
29
|
+
import asyncio
|
|
30
|
+
from hud import HUDClient
|
|
31
|
+
|
|
32
|
+
async def main():
|
|
33
|
+
# Initialize client with API key
|
|
34
|
+
client = HUDClient(api_key=os.getenv("HUD_API_KEY"))
|
|
35
|
+
|
|
36
|
+
# Load a gym and evaluation set
|
|
37
|
+
gym = await client.load_gym(id="OSWorld-Ubuntu")
|
|
38
|
+
evalset = await client.load_evalset(id="OSWorld-Ubuntu")
|
|
39
|
+
|
|
40
|
+
# Create a run and environment
|
|
41
|
+
run = client.create_run(name="example-run", gym=gym, evalset=evalset)
|
|
42
|
+
env = await run.make(metadata={"agent_id": "OSWORLD-1"})
|
|
43
|
+
await env.wait_for_ready()
|
|
44
|
+
|
|
45
|
+
###
|
|
46
|
+
### Agent loop goes here, see example in /examples
|
|
47
|
+
###
|
|
48
|
+
|
|
49
|
+
# Evaluate the environment
|
|
50
|
+
result = await env.evaluate()
|
|
51
|
+
|
|
52
|
+
# Close the environment when done
|
|
53
|
+
await env.close()
|
|
54
|
+
|
|
55
|
+
# Get analytics for the run such as rewards, task completions, etc.
|
|
56
|
+
analytics = await run.get_analytics()
|
|
57
|
+
print(analytics)
|
|
58
|
+
|
|
59
|
+
if __name__ == "__main__":
|
|
60
|
+
asyncio.run(main())
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Features
|
|
64
|
+
|
|
65
|
+
- Connect to HUD evaluation environments
|
|
66
|
+
- Run benchmarks across various tasks
|
|
67
|
+
- Support for different agent adapters
|
|
68
|
+
- Asynchronous API
|
|
69
|
+
|
|
70
|
+
## Documentation
|
|
71
|
+
|
|
72
|
+
For comprehensive guides, examples, and API reference, visit:
|
|
73
|
+
- [Getting Started](https://docs.hud.so/introduction)
|
|
74
|
+
- [Installation](https://docs.hud.so/installation)
|
|
75
|
+
- [API Reference](https://docs.hud.so/api-reference)
|
|
76
|
+
- [Examples](https://docs.hud.so/examples)
|
|
77
|
+
|
|
78
|
+
## License
|
|
79
|
+
|
|
80
|
+
[MIT License](LICENSE)
|
|
@@ -2,19 +2,18 @@ import os
|
|
|
2
2
|
import json
|
|
3
3
|
from agent.base import Agent
|
|
4
4
|
from anthropic import Anthropic
|
|
5
|
+
from anthropic.types import Message
|
|
5
6
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
super().__init__()
|
|
10
|
-
self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
|
7
|
+
class ClaudeAgent(Agent):
|
|
8
|
+
def __init__(self, client: Anthropic):
|
|
9
|
+
super().__init__(client)
|
|
11
10
|
self.model = "claude-3-7-sonnet-20250219"
|
|
12
11
|
self.max_tokens = 4096
|
|
13
12
|
self.tool_version = "20250124"
|
|
14
13
|
self.thinking_budget = 1024
|
|
15
14
|
self.conversation = [] # Store the full conversation history including Claude's responses
|
|
16
15
|
|
|
17
|
-
async def predict(self, base64_image: str | None = None, input_text: str | None = None):
|
|
16
|
+
async def predict(self, base64_image: str | None = None, input_text: str | None = None) -> tuple[bool, str | object | None]:
|
|
18
17
|
message = self._create_message(base64_image, input_text)
|
|
19
18
|
|
|
20
19
|
# Only append the message if it's not empty
|
|
@@ -33,7 +32,10 @@ class Claude(Agent):
|
|
|
33
32
|
self.conversation.append(assistant_message)
|
|
34
33
|
|
|
35
34
|
self.responses.append(response)
|
|
36
|
-
|
|
35
|
+
|
|
36
|
+
done, processed = await self.process_response(response)
|
|
37
|
+
|
|
38
|
+
return done, processed
|
|
37
39
|
|
|
38
40
|
def _create_message(self, base64_image: str | None = None, input_text: str | None = None):
|
|
39
41
|
"""Create appropriate message based on context and inputs"""
|
|
@@ -120,19 +122,17 @@ class Claude(Agent):
|
|
|
120
122
|
except Exception as e:
|
|
121
123
|
raise
|
|
122
124
|
|
|
123
|
-
def process_response(self, response:
|
|
125
|
+
async def process_response(self, response: Message) -> tuple[bool, str | object | None]:
|
|
124
126
|
# Check if response contains a computer tool use
|
|
125
|
-
has_computer_tool_use = False
|
|
126
127
|
computer_action = None
|
|
127
|
-
for block in response
|
|
128
|
+
for block in response.content:
|
|
128
129
|
if block.type == "tool_use" and block.name == "computer":
|
|
129
|
-
has_computer_tool_use = True
|
|
130
130
|
computer_action = block.input
|
|
131
131
|
break
|
|
132
132
|
|
|
133
|
-
if
|
|
133
|
+
if response.content[-1].type == "text":
|
|
134
134
|
# No computer tool use, treat as final response
|
|
135
|
-
return True, str(response
|
|
135
|
+
return True, str(response.content[-1].text)
|
|
136
136
|
|
|
137
137
|
# If we have a computer action, adapt it to environment actions
|
|
138
138
|
if computer_action:
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
## Claude Computer Use evaluation on OSWorld
|
|
2
|
+
|
|
3
|
+
### 1. Setup
|
|
4
|
+
|
|
5
|
+
Step 1: Install from the source repository:
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Clone the repository
|
|
9
|
+
git clone https://github.com/Human-Data/hud-sdk.git
|
|
10
|
+
cd hud-sdk
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Step 2: Create a virtual environment:
|
|
14
|
+
```bash
|
|
15
|
+
# Option 1: using venv
|
|
16
|
+
python -m venv .venv
|
|
17
|
+
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
|
18
|
+
|
|
19
|
+
# Option 2: using uv (recommended)
|
|
20
|
+
uv venv
|
|
21
|
+
# Then activate according to your shell
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Step 3: Install in development mode with all dependencies:
|
|
25
|
+
```bash
|
|
26
|
+
# Option 1: using pip
|
|
27
|
+
pip install -e ".[dev]"
|
|
28
|
+
|
|
29
|
+
# Option 2: using uv (recommended)
|
|
30
|
+
uv pip install -e ".[dev]"
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### 2. Set up environment variables
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
HUD_API_KEY=<your-api-key>
|
|
37
|
+
ANTHROPIC_API_KEY=<your-api-key>
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### 3. Run the OSWorld example
|
|
41
|
+
|
|
42
|
+
Explore the [claude_osworld.ipynb](https://github.com/Human-Data/hud-sdk/blob/main/examples/claude_osworld.ipynb) notebook from this folder in Jupyter Notebook.
|
|
43
|
+
|
|
44
|
+
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "code",
|
|
5
|
+
"execution_count": 1,
|
|
6
|
+
"metadata": {},
|
|
7
|
+
"outputs": [],
|
|
8
|
+
"source": [
|
|
9
|
+
"import os\n",
|
|
10
|
+
"from dotenv import load_dotenv\n",
|
|
11
|
+
"load_dotenv()\n",
|
|
12
|
+
"\n",
|
|
13
|
+
"from hud import HUDClient\n",
|
|
14
|
+
"from hud.adapters.claude.adapter import ClaudeAdapter\n",
|
|
15
|
+
"from agent.claude import ClaudeAgent\n",
|
|
16
|
+
"\n",
|
|
17
|
+
"from anthropic import Anthropic"
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"cell_type": "code",
|
|
22
|
+
"execution_count": 2,
|
|
23
|
+
"metadata": {},
|
|
24
|
+
"outputs": [],
|
|
25
|
+
"source": [
|
|
26
|
+
"# initialize HUD client\n",
|
|
27
|
+
"client = HUDClient(api_key=os.getenv(\"HUD_API_KEY\"))\n",
|
|
28
|
+
"\n",
|
|
29
|
+
"# initalize Claude Computer Use agent\n",
|
|
30
|
+
"anthropic = Anthropic(api_key=os.getenv(\"ANTHROPIC_API_KEY\"))\n",
|
|
31
|
+
"agent = ClaudeAgent(anthropic)\n",
|
|
32
|
+
"\n",
|
|
33
|
+
"# initialize adapter to interact with the environment\n",
|
|
34
|
+
"cua_adapter = ClaudeAdapter()"
|
|
35
|
+
]
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"cell_type": "code",
|
|
39
|
+
"execution_count": null,
|
|
40
|
+
"metadata": {},
|
|
41
|
+
"outputs": [],
|
|
42
|
+
"source": [
|
|
43
|
+
"# load OSWorld environment\n",
|
|
44
|
+
"gym = await client.load_gym(id=\"OSWorld-Ubuntu\")\n",
|
|
45
|
+
"\n",
|
|
46
|
+
"# load OSWorld evalset\n",
|
|
47
|
+
"evalset = await client.load_evalset(id=\"OSWorld-Ubuntu\")\n",
|
|
48
|
+
"\n",
|
|
49
|
+
"# create a run that will host all evaluations\n",
|
|
50
|
+
"run = client.create_run(name=\"Claude-test-OSWorld\", gym=gym, evalset=evalset)\n",
|
|
51
|
+
"\n",
|
|
52
|
+
"# fetch all task ids from the run\n",
|
|
53
|
+
"tasks = await run.fetch_task_ids()\n",
|
|
54
|
+
"print(f\"Total tasks in OSWorld: {len(tasks)}\")"
|
|
55
|
+
]
|
|
56
|
+
},
|
|
57
|
+
{
|
|
58
|
+
"cell_type": "code",
|
|
59
|
+
"execution_count": null,
|
|
60
|
+
"metadata": {},
|
|
61
|
+
"outputs": [],
|
|
62
|
+
"source": [
|
|
63
|
+
"# it may take around 3 minutes to initialize the OSWorld platform and reset to a task\n",
|
|
64
|
+
"\n",
|
|
65
|
+
"# make a HUD environment\n",
|
|
66
|
+
"env = await run.make()\n",
|
|
67
|
+
"await env.wait_for_ready()\n",
|
|
68
|
+
"\n",
|
|
69
|
+
"# reset to a task with an observation (screenshot and text)\n",
|
|
70
|
+
"obs = await env.reset(task_id=tasks[1])\n",
|
|
71
|
+
"print(f\"Task description: {obs.text}\")\n",
|
|
72
|
+
"\n",
|
|
73
|
+
"# watch the agent live\n",
|
|
74
|
+
"live_url = await env.get_vnc_url()\n",
|
|
75
|
+
"client.display_stream(live_url)"
|
|
76
|
+
]
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
"cell_type": "code",
|
|
80
|
+
"execution_count": null,
|
|
81
|
+
"metadata": {},
|
|
82
|
+
"outputs": [],
|
|
83
|
+
"source": [
|
|
84
|
+
"# agent loop\n",
|
|
85
|
+
"for i in range(8):\n",
|
|
86
|
+
" # rescale screenshot to Claude's resolution\n",
|
|
87
|
+
" screenshot = cua_adapter.rescale(obs.screenshot)\n",
|
|
88
|
+
"\n",
|
|
89
|
+
" # agent's next action\n",
|
|
90
|
+
" done, response = await agent.predict(screenshot, obs.text)\n",
|
|
91
|
+
" if done:\n",
|
|
92
|
+
" env.final_response = str(response)\n",
|
|
93
|
+
" break\n",
|
|
94
|
+
"\n",
|
|
95
|
+
" # convert to HUD action space\n",
|
|
96
|
+
" actions = cua_adapter.adapt_list([response])\n",
|
|
97
|
+
" print(f\"Agent's action: {response}\")\n",
|
|
98
|
+
"\n",
|
|
99
|
+
" # step the environment forward\n",
|
|
100
|
+
" obs, reward, terminated, info = await env.step(actions)\n",
|
|
101
|
+
"\n",
|
|
102
|
+
" # drop out if terminated\n",
|
|
103
|
+
" if terminated:\n",
|
|
104
|
+
" break\n",
|
|
105
|
+
" print(f\"Step {i+1} completed\")\n"
|
|
106
|
+
]
|
|
107
|
+
},
|
|
108
|
+
{
|
|
109
|
+
"cell_type": "code",
|
|
110
|
+
"execution_count": null,
|
|
111
|
+
"metadata": {},
|
|
112
|
+
"outputs": [],
|
|
113
|
+
"source": [
|
|
114
|
+
"# evaluate environment state\n",
|
|
115
|
+
"result = await env.evaluate()\n",
|
|
116
|
+
"print(f\"Evaluation result: {result}\")\n",
|
|
117
|
+
"\n",
|
|
118
|
+
"# close environment\n",
|
|
119
|
+
"await env.close()"
|
|
120
|
+
]
|
|
121
|
+
},
|
|
122
|
+
{
|
|
123
|
+
"cell_type": "code",
|
|
124
|
+
"execution_count": null,
|
|
125
|
+
"metadata": {},
|
|
126
|
+
"outputs": [],
|
|
127
|
+
"source": [
|
|
128
|
+
"analytics = await run.get_analytics()\n",
|
|
129
|
+
"print(analytics)"
|
|
130
|
+
]
|
|
131
|
+
}
|
|
132
|
+
],
|
|
133
|
+
"metadata": {
|
|
134
|
+
"kernelspec": {
|
|
135
|
+
"display_name": ".venv",
|
|
136
|
+
"language": "python",
|
|
137
|
+
"name": "python3"
|
|
138
|
+
},
|
|
139
|
+
"language_info": {
|
|
140
|
+
"codemirror_mode": {
|
|
141
|
+
"name": "ipython",
|
|
142
|
+
"version": 3
|
|
143
|
+
},
|
|
144
|
+
"file_extension": ".py",
|
|
145
|
+
"mimetype": "text/x-python",
|
|
146
|
+
"name": "python",
|
|
147
|
+
"nbconvert_exporter": "python",
|
|
148
|
+
"pygments_lexer": "ipython3",
|
|
149
|
+
"version": "3.12.9"
|
|
150
|
+
}
|
|
151
|
+
},
|
|
152
|
+
"nbformat": 4,
|
|
153
|
+
"nbformat_minor": 2
|
|
154
|
+
}
|
|
@@ -5,14 +5,14 @@ HUD Gym SDK - A Python SDK for interacting with HUD environments.
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
7
|
from hud.client import HUDClient
|
|
8
|
-
from hud.
|
|
8
|
+
from hud.environment import Environment, EvalSet, Observation, TaskResult
|
|
9
9
|
from hud.gym import Gym
|
|
10
10
|
from hud.run import Run
|
|
11
11
|
|
|
12
|
-
__version__ = "0.1.
|
|
12
|
+
__version__ = "0.1.0b2"
|
|
13
13
|
|
|
14
14
|
__all__ = [
|
|
15
|
-
"
|
|
15
|
+
"Environment",
|
|
16
16
|
"EvalSet",
|
|
17
17
|
"Gym",
|
|
18
18
|
"HUDClient",
|
|
@@ -8,7 +8,7 @@ import json
|
|
|
8
8
|
from typing import Any
|
|
9
9
|
|
|
10
10
|
from .adapters.common import Adapter
|
|
11
|
-
from .
|
|
11
|
+
from .environment import EvalSet
|
|
12
12
|
from .gym import Gym
|
|
13
13
|
from .run import Run, RunResponse
|
|
14
14
|
from .server import make_request, make_sync_request
|
|
@@ -23,15 +23,15 @@ class HUDClient:
|
|
|
23
23
|
evalsets, and create runs.
|
|
24
24
|
"""
|
|
25
25
|
|
|
26
|
-
def __init__(self, api_key: str) -> None:
|
|
26
|
+
def __init__(self, api_key: str | None = None) -> None:
|
|
27
27
|
"""
|
|
28
28
|
Initialize the HUD client with an API key.
|
|
29
29
|
|
|
30
30
|
Args:
|
|
31
31
|
api_key: API key for authentication with the HUD API
|
|
32
32
|
"""
|
|
33
|
-
self.api_key = api_key
|
|
34
|
-
settings.api_key = api_key
|
|
33
|
+
self.api_key = api_key or settings.api_key
|
|
34
|
+
settings.api_key = self.api_key
|
|
35
35
|
|
|
36
36
|
async def load_gym(self, id: str) -> Gym:
|
|
37
37
|
"""
|
|
@@ -182,3 +182,18 @@ class HUDClient:
|
|
|
182
182
|
config=config,
|
|
183
183
|
metadata=metadata,
|
|
184
184
|
)
|
|
185
|
+
|
|
186
|
+
def display_stream(self, live_url: str) -> None:
|
|
187
|
+
"""
|
|
188
|
+
Display a stream in the HUD system.
|
|
189
|
+
"""
|
|
190
|
+
from IPython.display import HTML, display
|
|
191
|
+
html_content = f"""
|
|
192
|
+
<div style="width: 960px; height: 540px; overflow: hidden;">
|
|
193
|
+
<div style="transform: scale(0.5); transform-origin: top left;">
|
|
194
|
+
<iframe src="{live_url}" width="1920" height="1080" style="border: 1px solid #ddd;">
|
|
195
|
+
</iframe>
|
|
196
|
+
</div>
|
|
197
|
+
</div>
|
|
198
|
+
"""
|
|
199
|
+
display(HTML(html_content))
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
4
|
+
import enum
|
|
5
|
+
import logging
|
|
3
6
|
from typing import TYPE_CHECKING, Any
|
|
4
7
|
|
|
5
8
|
from pydantic import BaseModel
|
|
@@ -10,6 +13,7 @@ from hud.settings import settings
|
|
|
10
13
|
if TYPE_CHECKING:
|
|
11
14
|
from .adapters.common import Adapter
|
|
12
15
|
|
|
16
|
+
logger = logging.getLogger("hud.environment")
|
|
13
17
|
|
|
14
18
|
class Observation(BaseModel):
|
|
15
19
|
"""
|
|
@@ -38,8 +42,29 @@ class TaskResult(BaseModel):
|
|
|
38
42
|
terminated: bool
|
|
39
43
|
info: dict[str, Any]
|
|
40
44
|
|
|
45
|
+
class EnvironmentStatus(str, enum.Enum):
|
|
46
|
+
"""
|
|
47
|
+
Status of the environment.
|
|
48
|
+
|
|
49
|
+
Attributes:
|
|
50
|
+
INITIALIZING: The environment is initializing
|
|
51
|
+
RUNNING: The environment is running
|
|
52
|
+
COMPLETED: The environment is completed
|
|
53
|
+
ERROR: The environment is in an error state
|
|
54
|
+
"""
|
|
55
|
+
INITIALIZING = "initializing"
|
|
56
|
+
RUNNING = "running"
|
|
57
|
+
COMPLETED = "completed"
|
|
58
|
+
ERROR = "error"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
status_messages = {
|
|
62
|
+
EnvironmentStatus.RUNNING.value: "is running",
|
|
63
|
+
EnvironmentStatus.ERROR.value: "had an error initializing",
|
|
64
|
+
EnvironmentStatus.COMPLETED.value: "completed",
|
|
65
|
+
}
|
|
41
66
|
|
|
42
|
-
class
|
|
67
|
+
class Environment:
|
|
43
68
|
"""
|
|
44
69
|
Environment interface for agent interactions.
|
|
45
70
|
|
|
@@ -192,7 +217,9 @@ class Env:
|
|
|
192
217
|
api_key=settings.api_key,
|
|
193
218
|
)
|
|
194
219
|
|
|
195
|
-
async def reset(
|
|
220
|
+
async def reset(
|
|
221
|
+
self, task_id: str, metadata: dict[str, Any] | None = None
|
|
222
|
+
) -> Observation:
|
|
196
223
|
"""
|
|
197
224
|
Reset the environment to the task.
|
|
198
225
|
|
|
@@ -213,6 +240,18 @@ class Env:
|
|
|
213
240
|
)
|
|
214
241
|
return Observation(**data["observation"])
|
|
215
242
|
|
|
243
|
+
async def wait_for_ready(self) -> None:
|
|
244
|
+
"""Wait for the environment to be ready"""
|
|
245
|
+
while True:
|
|
246
|
+
state = await self.get_env_state()
|
|
247
|
+
if state in (
|
|
248
|
+
EnvironmentStatus.RUNNING.value,
|
|
249
|
+
EnvironmentStatus.ERROR.value,
|
|
250
|
+
EnvironmentStatus.COMPLETED.value,
|
|
251
|
+
):
|
|
252
|
+
logger.info("Environment %s %s", self.id, status_messages.get(state))
|
|
253
|
+
break
|
|
254
|
+
await asyncio.sleep(10)
|
|
216
255
|
|
|
217
256
|
class EvalSet:
|
|
218
257
|
"""
|