hud-python 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +16 -12
- hud/adapters/__init__.py +4 -2
- hud/adapters/claude/adapter.py +0 -1
- hud/adapters/common/adapter.py +11 -10
- hud/adapters/common/types.py +27 -13
- hud/adapters/operator/__init__.py +5 -0
- hud/adapters/operator/adapter.py +93 -0
- hud/agent/__init__.py +7 -0
- hud/agent/base.py +109 -0
- hud/agent/claude.py +187 -0
- hud/agent/operator.py +190 -0
- hud/env/__init__.py +11 -0
- hud/env/client.py +35 -0
- hud/env/docker_client.py +306 -0
- hud/env/environment.py +181 -0
- hud/env/local_docker_client.py +249 -0
- hud/env/remote_client.py +185 -0
- hud/env/remote_docker_client.py +221 -0
- hud/evaluators/__init__.py +10 -0
- hud/evaluators/base.py +31 -0
- hud/evaluators/inspect.py +29 -0
- hud/evaluators/judge.py +213 -0
- hud/evaluators/match.py +163 -0
- hud/evaluators/remote.py +78 -0
- hud/gym.py +101 -15
- hud/job.py +185 -0
- hud/server/__init__.py +2 -2
- hud/server/requests.py +87 -0
- hud/settings.py +13 -2
- hud/task.py +133 -0
- hud/taskset.py +95 -0
- hud/trajectory.py +90 -0
- hud/types.py +65 -0
- hud/utils/__init__.py +4 -2
- hud/utils/common.py +69 -0
- hud/utils/config.py +182 -4
- hud/utils/telemetry.py +67 -0
- hud_python-0.2.0.dist-info/METADATA +188 -0
- hud_python-0.2.0.dist-info/RECORD +44 -0
- {hud_python-0.1.4.dist-info → hud_python-0.2.0.dist-info}/licenses/LICENSE +1 -1
- hud/client.py +0 -200
- hud/environment.py +0 -317
- hud/run.py +0 -208
- hud_python-0.1.4.dist-info/METADATA +0 -125
- hud_python-0.1.4.dist-info/RECORD +0 -21
- {hud_python-0.1.4.dist-info → hud_python-0.2.0.dist-info}/WHEEL +0 -0
hud/evaluators/remote.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from hud.evaluators.base import EvaluationResult
|
|
7
|
+
from hud.server import make_request
|
|
8
|
+
from hud.settings import settings
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
async def _remote_eval_call(
    response: Any,
    answer: Any,
    eval_type: str,
    config: dict[str, Any] | None = None
) -> dict[str, Any]:
    """POST a single evaluation request to the HUD API.

    Any exception raised while building or sending the request is converted
    into a failure payload, so the caller always receives a dictionary.

    Args:
        response: The response to evaluate
        answer: The reference answer to compare against
        eval_type: Type of evaluation (e.g., "match", "judge", "agent")
        config: Optional configuration parameters; an empty dict is sent when omitted

    Returns:
        Whatever ``make_request`` returns on success. On failure, a dictionary
        with "score" set to -1.0, a "reason" describing the error and an
        empty "details" mapping.
    """
    try:
        endpoint = f"{settings.base_url}/evaluations/evaluate"
        payload = {
            "response": response,
            "answer": answer,
            "type": eval_type,
            "config": config or {}
        }
        return await make_request(
            method="POST",
            url=endpoint,
            json=payload,
            api_key=settings.api_key,
        )
    except Exception as exc:
        # Best effort: report the failure through the result instead of raising.
        failure: dict[str, Any] = {"score": -1.0}
        failure["reason"] = f"Remote evaluation failed: {exc!s}"
        failure["details"] = {}
        return failure
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def remote_evaluate(
    response: Any,
    answer: Any,
    eval_type: str = "default",
    config: dict[str, Any] | None = None
) -> EvaluationResult:
    """Evaluate a response using remote evaluation services.

    This is the synchronous entry point around ``_remote_eval_call``. It can be
    called both from plain synchronous code and from code that is already
    running inside an asyncio event loop.

    Args:
        response: The response to evaluate
        answer: The reference answer to compare against
        eval_type: Type of evaluation to perform
        config: Optional configuration for the evaluation

    Returns:
        EvaluationResult built from the server reply. Missing keys fall back to
        a score of -1.0, the reason "Remote evaluation completed" and empty
        criteria scores; ``mode`` is always ``eval_type``.
    """

    def _run_blocking() -> dict[str, Any]:
        # The coroutine is created here so it is bound to the loop that
        # asyncio.run() creates in whichever thread executes this helper.
        return asyncio.run(_remote_eval_call(
            response=response,
            answer=answer,
            eval_type=eval_type,
            config=config
        ))

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: asyncio.run() is safe to use directly.
        result = _run_blocking()
    else:
        # asyncio.run() refuses to start inside a running loop, so execute the
        # request on a short-lived worker thread that owns its own loop.
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            result = pool.submit(_run_blocking).result()

    return EvaluationResult(
        score=result.get("score", -1.0),
        reason=result.get("reason", "Remote evaluation completed"),
        mode=eval_type,
        criteria_scores=result.get("details", {})
    )
|
hud/gym.py
CHANGED
|
@@ -1,22 +1,108 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
3
6
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
+
from hud.env.environment import Environment
|
|
8
|
+
from hud.env.local_docker_client import LocalDockerClient
|
|
9
|
+
from hud.env.remote_client import RemoteClient
|
|
10
|
+
from hud.env.remote_docker_client import RemoteDockerClient
|
|
11
|
+
from hud.task import Task
|
|
12
|
+
from hud.types import CustomGym, Gym
|
|
13
|
+
from hud.utils.common import get_gym_id
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from hud.job import Job
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger("hud.gym")
|
|
7
19
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
20
|
+
async def make(
    env_src: Gym | Task,
    *,
    job: Job | None = None,
    job_id: str | None = None,
    metadata: dict[str, Any] | None = None,
) -> Environment:
    """
    Build an Environment from a gym specification or from a Task.

    Args:
        env_src: A gym specification, or a Task whose ``gym`` attribute is used
        job: Job object to associate with this environment
        job_id: ID of job to associate with this environment (deprecated, use job instead)
        metadata: Additional metadata for the environment

    Returns:
        The created Environment. When a Task was given, ``environment._setup()``
        has been awaited before it is returned.

    Raises:
        ValueError: If a custom gym has no dockerfile, has a location other than
            "local" or "remote", or if the gym is neither a CustomGym nor a string.
    """
    metadata = {} if metadata is None else metadata

    # Job precedence: explicit job object, then explicit job_id, then whatever
    # job a decorator further up the call stack has registered.
    if job is not None:
        resolved_job_id = job.id
    elif job_id is not None:
        resolved_job_id = job_id
    else:
        resolved_job_id = None
        try:
            from hud.job import get_active_job

            decorator_job = get_active_job()
            if decorator_job:
                resolved_job_id = decorator_job.id
        except ImportError:
            pass  # Module not available, skip

    # Split the source into its gym specification and (optional) task.
    if isinstance(env_src, Gym):
        gym, task = env_src, None
    elif isinstance(env_src, Task):
        gym, task = env_src.gym, env_src
    else:
        gym, task = None, None
    task_id = task.id if task else None

    if isinstance(gym, CustomGym):
        # Custom gyms are built from a dockerfile, either locally or remotely.
        if gym.dockerfile is None:
            raise ValueError("Dockerfile is required for custom environments")

        if gym.location == "local":
            logger.info("Creating local environment")
            client, build_data = await LocalDockerClient.create(gym.dockerfile)
        elif gym.location == "remote":
            logger.info("Creating remote environment")
            client, build_data = await RemoteDockerClient.create(
                dockerfile=gym.dockerfile,
                job_id=resolved_job_id,
                task_id=task_id,
                metadata=metadata,
            )
        else:
            raise ValueError(f"Invalid environment location: {gym.location}")

        # Point the client at the controller sources when they are configured.
        if gym.controller_source_dir:
            logger.info("Setting source path")
            client.set_source_path(Path(gym.controller_source_dir))
    elif isinstance(gym, str):
        logger.info("Creating private environment")
        # The string uniquely identifies the gym but is not the id used to
        # build the environment, so the real gym id is fetched from the HUD API.
        true_gym_id = await get_gym_id(gym)

        client, build_data = await RemoteClient.create(
            gym_id=true_gym_id,
            job_id=resolved_job_id,
            task_id=task_id,
            metadata=metadata,
        )
    else:
        raise ValueError(f"Invalid gym source: {gym}")

    # Wrap the client; a task additionally triggers the environment setup.
    environment = Environment(client=client, metadata=metadata, task=task, build_data=build_data)
    if task:
        await environment._setup()
    return environment
|
hud/job.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import datetime
|
|
4
|
+
import functools
|
|
5
|
+
import inspect
|
|
6
|
+
import logging
|
|
7
|
+
from collections.abc import Callable
|
|
8
|
+
from typing import Any, TypeVar, cast
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, TypeAdapter
|
|
11
|
+
|
|
12
|
+
from hud.server import make_request
|
|
13
|
+
from hud.settings import settings
|
|
14
|
+
from hud.trajectory import Trajectory
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger("hud.job")
|
|
17
|
+
|
|
18
|
+
# Type variable for the decorator
|
|
19
|
+
T = TypeVar("T", bound=Callable)
|
|
20
|
+
|
|
21
|
+
# Global registry to store active jobs created by decorators
|
|
22
|
+
_ACTIVE_JOBS = {}
|
|
23
|
+
|
|
24
|
+
class Job(BaseModel):
    """
    Groups a set of related trajectories under one identifier.

    The model carries the job's metadata and offers a method for fetching its
    trajectories. Instances should typically be obtained via `create_job` or
    `load_job`.
    """

    # Identifier of the job; used to build the API URL for its trajectories
    id: str
    # Name of the job
    name: str
    # Optional metadata mapping
    metadata: dict[str, Any] | None = None
    # Creation timestamp
    created_at: datetime.datetime
    # Status string as provided when the instance is built
    status: str

    async def load_trajectories(self, *, api_key: str | None = None) -> list[Trajectory]:
        """
        Fetch the trajectories that belong to this job.

        Args:
            api_key: API key to use; falls back to ``settings.api_key`` when omitted or empty

        Returns:
            List[Trajectory]: The trajectories in the job
        """
        resolved_key = api_key or settings.api_key
        endpoint = f"{settings.base_url}/v2/jobs/{self.id}/trajectories"

        payload = await make_request(method="GET", url=endpoint, api_key=resolved_key)

        adapter = TypeAdapter(list[Trajectory])
        return adapter.validate_python(payload)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
async def create_job(name: str, gym_id: str | None = None,
                     evalset_id: str | None = None,
                     metadata: dict[str, Any] | None = None) -> Job:
    """
    Creates a new job.

    Args:
        name: The name of the job
        gym_id: Optional gym id sent to the API along with the job
        evalset_id: Optional evalset id sent to the API along with the job
        metadata: Metadata for the job; an empty dict is sent when omitted

    Returns:
        Job: The created job instance

    Raises:
        KeyError: If the API response lacks "id", "name", "created_at" or "status"
        ValueError: If "created_at" is not a valid ISO 8601 timestamp
    """
    api_key = settings.api_key
    metadata = metadata or {}

    data = await make_request(
        method="POST",
        url=f"{settings.base_url}/v2/jobs",
        json={
            "name": name,
            "metadata": metadata,
            "gym_id": gym_id,
            "evalset_id": evalset_id,
        },
        api_key=api_key,
    )

    # Assume the backend API returns the full job data upon creation
    # or at least the necessary fields (id, name, metadata, created_at, status)
    # If not, we might need to make a subsequent GET request
    job_data = data  # Adjust if the API response structure is different

    created_at_raw = job_data["created_at"]
    # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11 on;
    # rewrite it as an explicit UTC offset so older interpreters parse it too.
    if isinstance(created_at_raw, str) and created_at_raw.endswith("Z"):
        created_at_raw = f"{created_at_raw[:-1]}+00:00"

    return Job(
        id=job_data["id"],
        name=job_data["name"],
        metadata=job_data.get("metadata", {}),  # Default to {} when the key is absent
        created_at=datetime.datetime.fromisoformat(created_at_raw),  # Parse datetime
        status=job_data["status"],
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
async def load_job(job_id: str, api_key: str | None = None) -> Job:
    """
    Fetch an existing job from the API by its ID.

    Args:
        job_id: The ID of the job to retrieve
        api_key: API key to use; falls back to ``settings.api_key`` when omitted or empty

    Returns:
        Job: The retrieved job instance

    Raises:
        ValueError: If the API returns an empty response for this ID
    """
    resolved_key = api_key or settings.api_key
    endpoint = f"{settings.base_url}/v2/jobs/{job_id}"

    payload = await make_request(method="GET", url=endpoint, api_key=resolved_key)

    if payload:
        # Let the model validate the fetched data and build the instance.
        return Job.model_validate(payload)
    raise ValueError(f"Job {job_id} not found")
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def job(
    name: str,
    metadata: dict[str, Any] | None = None
) -> Callable[[T], T]:
    """
    Decorator to automatically create and associate a job with all environments
    created within the decorated function.

    Each call of the decorated (async) function creates a fresh job via
    `create_job`, registers it in `_ACTIVE_JOBS` for the duration of the call
    and removes it again afterwards, whether the call succeeds or raises.

    Args:
        name: The name of the job
        metadata: Additional metadata for the job

    Returns:
        A decorator function that creates a job and associates it with environments
    """
    import uuid  # local import: only needed to build per-call registry keys

    def decorator(func: T) -> T:
        @functools.wraps(func)
        async def wrapper(*args: Any, **kwargs: Any) -> Any:
            # Create a job for this function call using the new function
            job = await create_job(
                name=name,
                metadata=metadata
            )

            # The key must be unique per invocation. A key derived from the
            # wrapper object is shared by every call of the same function, so
            # overlapping calls would overwrite or delete each other's job.
            call_id = f"{func.__module__}.{func.__qualname__}_{uuid.uuid4().hex}"
            _ACTIVE_JOBS[call_id] = job

            try:
                # Tag this frame so a stack walk can find the registry key
                frame = inspect.currentframe()
                if frame:
                    frame.f_locals["_job_call_id"] = call_id

                # Run the decorated function
                return await func(*args, **kwargs)
            finally:
                # Clean up
                _ACTIVE_JOBS.pop(call_id, None)

        return cast(T, wrapper)
    return decorator
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def get_active_job() -> Job | None:
    """
    Look up the job registered by an enclosing decorated call, if there is one.

    The call stack is walked outwards from this function. The first frame whose
    locals contain ``_job_call_id`` with a key that is still present in
    ``_ACTIVE_JOBS`` determines the result.
    Used internally by gym.make to automatically associate environments with jobs.

    Returns:
        The active job or None if no job is active
    """
    current = inspect.currentframe()
    while current:
        frame_locals = current.f_locals
        if "_job_call_id" in frame_locals:
            registry_key = frame_locals["_job_call_id"]
            # A stale marker (job already cleaned up) is skipped and the walk
            # continues with the outer frames.
            if registry_key in _ACTIVE_JOBS:
                return _ACTIVE_JOBS[registry_key]
        current = current.f_back

    return None
|
hud/server/__init__.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from .requests import RequestError, make_request
|
|
3
|
+
from .requests import RequestError, make_request, make_request_sync
|
|
4
4
|
|
|
5
|
-
__all__ = ["RequestError", "make_request"]
|
|
5
|
+
__all__ = ["RequestError", "make_request", "make_request_sync"]
|
hud/server/requests.py
CHANGED
|
@@ -6,6 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
import asyncio
|
|
8
8
|
import logging
|
|
9
|
+
import time
|
|
9
10
|
from typing import Any
|
|
10
11
|
|
|
11
12
|
import httpx
|
|
@@ -191,3 +192,89 @@ async def make_request(
|
|
|
191
192
|
except Exception as e:
|
|
192
193
|
raise RequestError(f"Unexpected error: {e!s}") from None
|
|
193
194
|
raise RequestError(f"Request failed after {max_retries} retries with unknown error")
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def make_request_sync(
    method: str,
    url: str,
    json: Any | None = None,
    api_key: str | None = None,
    max_retries: int = 4,
    retry_delay: float = 2.0,
) -> dict[str, Any]:
    """
    Send a blocking HTTP request to the HUD API, retrying transient failures.

    Responses with status 502, 503 or 504 and httpx network errors are retried
    up to ``max_retries`` times, sleeping ``retry_delay * 2 ** (attempt - 1)``
    seconds before each retry.

    Args:
        method: HTTP method (GET, POST, etc.)
        url: Full URL for the request
        json: Optional JSON serializable data
        api_key: API key for authentication
        max_retries: Maximum number of retries
        retry_delay: Delay between retries
    Returns:
        dict: JSON response from the server

    Raises:
        RequestError: If API key is missing or request fails
    """
    if not api_key:
        raise RequestError("API key is required but not provided")

    auth_headers = {"Authorization": f"Bearer {api_key}"}
    transient_statuses = (502, 503, 504)

    # One initial attempt plus up to max_retries retries.
    for attempt in range(1, max_retries + 2):
        retries_left = attempt <= max_retries

        try:
            with httpx.Client(
                timeout=600.0,  # Long running requests can take up to 10 minutes
                limits=httpx.Limits(
                    max_connections=1000,
                    max_keepalive_connections=1000,
                    keepalive_expiry=10.0,
                ),
            ) as client:
                response = client.request(
                    method=method, url=url, json=json, headers=auth_headers
                )

                # Transient gateway errors are retried while attempts remain
                if retries_left and response.status_code in transient_statuses:
                    backoff = retry_delay * (2 ** (attempt - 1))  # Exponential backoff
                    logger.warning(
                        "Received status %d from %s, retrying in %.2f seconds (attempt %d/%d)",
                        response.status_code,
                        url,
                        backoff,
                        attempt,
                        max_retries,
                    )
                    time.sleep(backoff)
                    continue

                response.raise_for_status()
                return response.json()
        except httpx.HTTPStatusError as status_error:
            raise RequestError.from_http_error(status_error) from None
        except httpx.RequestError as network_error:
            if not retries_left:
                raise RequestError(f"Network error: {network_error!s}") from None
            backoff = retry_delay * (2 ** (attempt - 1))
            logger.warning(
                "Network error %s from %s, retrying in %.2f seconds (attempt %d/%d)",
                str(network_error),
                url,
                backoff,
                attempt,
                max_retries,
            )
            time.sleep(backoff)
            continue
        except Exception as unexpected:
            raise RequestError(f"Unexpected error: {unexpected!s}") from None
    raise RequestError(f"Request failed after {max_retries} retries with unknown error")
|
hud/settings.py
CHANGED
|
@@ -15,7 +15,7 @@ class Settings(BaseSettings):
|
|
|
15
15
|
model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="allow")
|
|
16
16
|
|
|
17
17
|
base_url: str = Field(
|
|
18
|
-
default="https://
|
|
18
|
+
default="https://orcstaging.hud.so/hud-gym/api",
|
|
19
19
|
description="Base URL for the HUD API",
|
|
20
20
|
validation_alias="base_url",
|
|
21
21
|
)
|
|
@@ -25,7 +25,18 @@ class Settings(BaseSettings):
|
|
|
25
25
|
description="API key for authentication with the HUD API",
|
|
26
26
|
validation_alias="HUD_API_KEY",
|
|
27
27
|
)
|
|
28
|
-
|
|
28
|
+
|
|
29
|
+
anthropic_api_key: str | None = Field(
|
|
30
|
+
default=None,
|
|
31
|
+
description="API key for Anthropic models",
|
|
32
|
+
validation_alias="ANTHROPIC_API_KEY",
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
openai_api_key: str | None = Field(
|
|
36
|
+
default=None,
|
|
37
|
+
description="API key for OpenAI models",
|
|
38
|
+
validation_alias="OPENAI_API_KEY",
|
|
39
|
+
)
|
|
29
40
|
|
|
30
41
|
# Create a singleton instance
|
|
31
42
|
settings = Settings()
|
hud/task.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
from hud.types import CustomGym, Gym
|
|
8
|
+
from hud.utils import HudStyleConfig
|
|
9
|
+
from hud.utils.config import HudStyleConfigs
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from inspect_ai.dataset import Sample
|
|
13
|
+
|
|
14
|
+
# Environment specifications:
|
|
15
|
+
# These represent the environment as a whole, including both the controller
|
|
16
|
+
# and the environment type (eg, what os, which services are running)
|
|
17
|
+
|
|
18
|
+
UBUNTU_DOCKERFILE = "ubuntu:latest"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def convert_inspect_setup(setup: str) -> list[HudStyleConfig]:
    """
    Wrap an Inspect setup script in HUD-style configuration.

    An Inspect setup is one bash string to run in the environment, so the
    result is a one-element list holding a "bash" HudStyleConfig whose only
    argument is that string.
    """
    bash_step = HudStyleConfig(function="bash", args=[setup])
    return [bash_step]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class Task(BaseModel):
    """A task that can be executed and evaluated.

    A Task represents a specific activity to be performed in an environment.
    It contains the prompt describing the task and configurations for
    setting up and evaluating the environment.

    The setup and evaluate configurations can be in several formats:
    - String (function name): "chrome.maximize"
    - String (function with args): "chrome.activate_tab 5"
    - Dict: {"function": "chrome.navigate", "args": ["https://example.com"]}
    - List of the above: ["chrome.maximize", {"function": "chrome.navigate", "args": ["https://example.com"]}]

    Attributes:
        id: The remote task ID (optional if local-only)
        prompt: The task prompt or instruction
        setup: Environment setup configuration (optional)
        evaluate: Configuration for evaluating responses
        metadata: Additional task metadata
        choices: Multiple choice answer list (for Inspect compatibility)
        target: Ideal target output (for Inspect compatibility)
        files: Files that go along with the task (for Inspect compatibility)
        gym: Environment specification
        config: Optional free-form configuration mapping (not used within this class)
    """

    id: str | None = None
    prompt: str
    setup: HudStyleConfigs | None = None
    evaluate: HudStyleConfigs | None = None
    gym: Gym | None = None

    target: str | list[str] | None = None

    choices: list[str] | None = None
    files: dict[str, str] | None = None
    metadata: dict[str, Any] | None = None

    config: dict[str, Any] | None = None

    @classmethod
    def from_inspect_sample(cls, sample: Sample) -> Task:
        """Create a Task from an Inspect dataset sample.

        The resulting task uses a local CustomGym built from the sample's
        docker sandbox configuration, or from ``UBUNTU_DOCKERFILE`` when the
        sample does not provide one. The sample's files are carried over on
        the task's ``files`` attribute.

        Args:
            sample: An Inspect dataset Sample object

        Returns:
            Task instance

        Raises:
            ValueError: If the sample requests a sandbox other than docker, or
                its sandbox value is neither a string nor a 2-tuple

        The Inspect Sample has these fields:
        - input (str | list[ChatMessage]): The input to be submitted to the model
        - choices (list[str] | None): Optional multiple choice answer list
        - target (str | list[str] | None): Optional ideal target output
        - id (str | None): Optional unique identifier for sample
        - metadata (dict[str, Any] | None): Optional arbitrary metadata
        - sandbox (str | tuple[str, str]): Optional sandbox environment type
        - files (dict[str, str] | None): Optional files that go with the sample
        - setup (str | None): Optional setup script to run for sample
        """
        # Extract the input as prompt
        prompt = sample.input
        if isinstance(prompt, list):  # Handle ChatMessage format
            # Flatten the chat into "Role: content" paragraphs
            prompt = "\n\n".join(
                f"{message.role.capitalize()}: {message.content}" for message in prompt
            )

        # Map sandbox from Inspect to our envspec
        sandbox = sample.sandbox
        dockerfile = None
        if sandbox:
            if isinstance(sandbox, str):
                if sandbox != "docker":
                    raise ValueError("docker is the only supported sandbox")
            elif isinstance(sandbox, tuple) and len(sandbox) == 2:
                sandbox_type, sandbox_config = sandbox
                if sandbox_type != "docker":
                    raise ValueError("docker is the only supported sandbox")
                dockerfile = sandbox_config
            else:
                raise ValueError("Invalid sandbox configuration")

        gym = CustomGym(
            dockerfile=dockerfile or UBUNTU_DOCKERFILE,
            location="local",
        )

        return cls(
            # Compare against None so a legitimate falsy id such as 0 is kept
            id=str(sample.id) if sample.id is not None else None,
            prompt=prompt,
            setup=convert_inspect_setup(sample.setup) if sample.setup else [],
            metadata=sample.metadata,
            choices=sample.choices,
            target=sample.target,
            # Keep the sample's files instead of silently dropping them
            files=sample.files,
            gym=gym,
        )

    def convert_sdk01(self) -> None:
        """Replace setup and evaluate with single "reset" / "evaluate" calls.

        ``setup`` becomes one "reset" call carrying this task's id and
        ``evaluate`` becomes one "evaluate" call without arguments.
        """
        self.setup = [HudStyleConfig(function="reset", args=[{"task_id": self.id}])]
        self.evaluate = [HudStyleConfig(function="evaluate", args=[])]
|