hud-python 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

hud/__init__.py CHANGED
@@ -5,10 +5,10 @@ HUD Gym SDK - A Python SDK for interacting with HUD environments.
5
5
  from __future__ import annotations
6
6
 
7
7
  from . import agent, env, gym, settings, task, taskset, types, utils
8
- from .job import create_job, job, load_job
8
+ from .job import create_job, job, load_job, run_job
9
9
  from .taskset import load_taskset
10
10
 
11
- __version__ = "0.2.1"
11
+ __version__ = "0.2.2"
12
12
 
13
13
  __all__ = [
14
14
  "agent",
@@ -18,6 +18,7 @@ __all__ = [
18
18
  "job",
19
19
  "load_job",
20
20
  "load_taskset",
21
+ "run_job",
21
22
  "settings",
22
23
  "task",
23
24
  "taskset",
hud/adapters/__init__.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  from .claude import ClaudeAdapter
4
4
  from .common import CLA, Adapter
5
+ from .common.types import ResponseAction
5
6
  from .operator import OperatorAdapter
6
7
 
7
- __all__ = ["CLA", "Adapter", "ClaudeAdapter", "OperatorAdapter"]
8
+ __all__ = ["CLA", "Adapter", "ClaudeAdapter", "OperatorAdapter", "ResponseAction"]
@@ -23,9 +23,13 @@ from hud.adapters.common.types import (
23
23
 
24
24
  class ClaudeAdapter(Adapter):
25
25
  KEY_MAP: ClassVar[dict[str, CLAKey]] = {
26
- "Return": "enter",
27
- "Super": "win",
28
- }
26
+ "return": "enter",
27
+ "super": "win",
28
+ "super_l": "win",
29
+ "super_r": "win",
30
+ "right shift": "shift",
31
+ "left shift": "shift",
32
+ }
29
33
 
30
34
  def __init__(self) -> None:
31
35
  super().__init__()
@@ -34,7 +38,8 @@ class ClaudeAdapter(Adapter):
34
38
 
35
39
  def _map_key(self, key: str) -> CLAKey:
36
40
  """Map a key to its standardized form."""
37
- return self.KEY_MAP.get(key, key.lower()) # type: ignore
41
+ return self.KEY_MAP.get(key.lower(), key.lower()) # type: ignore
42
+
38
43
  def convert(self, data: Any) -> CLA:
39
44
  try:
40
45
  action_type = data.get("action")
@@ -20,7 +20,6 @@ class Point(BaseModel):
20
20
  class ClickAction(CLAAction):
21
21
  type: Literal["click"] = "click"
22
22
  point: Point | None = None
23
- selector: str | None = None
24
23
  button: Literal["left", "right", "wheel", "back", "forward"] = "left"
25
24
  pattern: list[int] | None = None # [delay_1, delay_2, ...]
26
25
  hold_keys: list[CLAKey] | None = None
@@ -48,7 +47,6 @@ class KeyUpAction(CLAAction):
48
47
  class TypeAction(CLAAction):
49
48
  type: Literal["type"] = "type"
50
49
  text: str
51
- selector: str | None = None
52
50
  enter_after: bool | None = False
53
51
 
54
52
 
@@ -64,7 +62,6 @@ class ScrollAction(CLAAction):
64
62
  class MoveAction(CLAAction):
65
63
  type: Literal["move"] = "move"
66
64
  point: Point | None = None
67
- selector: str | None = None
68
65
  offset: Point | None = None
69
66
 
70
67
 
@@ -20,11 +20,11 @@ from hud.adapters.common.types import (
20
20
 
21
21
  class OperatorAdapter(Adapter):
22
22
  KEY_MAP: ClassVar[dict[str, CLAKey]] = {
23
- "Return": "enter",
24
- "ArrowUp": "up",
25
- "ArrowDown": "down",
26
- "ArrowLeft": "left",
27
- "ArrowRight": "right",
23
+ "return": "enter",
24
+ "arrowup": "up",
25
+ "arrowdown": "down",
26
+ "arrowleft": "left",
27
+ "arrowright": "right",
28
28
  }
29
29
 
30
30
  def __init__(self) -> None:
@@ -35,7 +35,7 @@ class OperatorAdapter(Adapter):
35
35
 
36
36
  def _map_key(self, key: str) -> CLAKey:
37
37
  """Map a key to its standardized form."""
38
- return self.KEY_MAP.get(key, key.lower()) # type: ignore
38
+ return self.KEY_MAP.get(key.lower(), key.lower()) # type: ignore
39
39
 
40
40
  def convert(self, data: Any) -> CLA:
41
41
  """Convert a Computer Use action to a HUD action"""
hud/agent/__init__.py CHANGED
@@ -1,7 +1,8 @@
1
1
  from .base import Agent
2
2
  from .claude import ClaudeAgent
3
3
  from .operator import OperatorAgent
4
+ from .langchain import LangchainAgent
4
5
 
5
6
  from hud.adapters import OperatorAdapter, ClaudeAdapter
6
7
 
7
- __all__ = ["Agent", "ClaudeAgent", "OperatorAgent", "OperatorAdapter", "ClaudeAdapter"]
8
+ __all__ = ["Agent", "ClaudeAgent", "OperatorAgent", "OperatorAdapter", "ClaudeAdapter", "LangchainAgent"]
hud/agent/langchain.py ADDED
@@ -0,0 +1,198 @@
1
+ import logging
2
+ from typing import Any, Generic, List, Optional, TypeVar, Union, cast
3
+
4
+ # Langchain imports
5
+ from langchain_core.language_models import BaseLanguageModel
6
+ from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
7
+ from langchain_core.runnables import Runnable, RunnableSerializable
8
+ from pydantic import Field, BaseModel
9
+
10
+ # HUD imports
11
+ from hud.adapters import Adapter
12
+ from hud.agent.base import Agent
13
+ from hud.env.environment import Observation
14
+ from hud.adapters.common.types import (
15
+ CLA,
16
+ ClickAction,
17
+ TypeAction,
18
+ ScrollAction,
19
+ MoveAction,
20
+ DragAction,
21
+ PressAction,
22
+ KeyDownAction,
23
+ KeyUpAction,
24
+ WaitAction,
25
+ ResponseAction,
26
+ CustomAction,
27
+ # Exclude ScreenshotFetch, PositionFetch as they are internal
28
+ )
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+ # Define a Pydantic Union type representing exactly ONE possible CLA action
33
+ # This is what we'll ask the Langchain model to output.
34
+ SingleCLAction = Union[
35
+ ClickAction,
36
+ TypeAction,
37
+ ScrollAction,
38
+ MoveAction,
39
+ DragAction,
40
+ PressAction,
41
+ KeyDownAction,
42
+ KeyUpAction,
43
+ WaitAction,
44
+ ResponseAction,
45
+ ]
46
+
47
+ # Define a Pydantic model to wrap the single action, potentially making it
48
+ # easier for the LLM to consistently output the desired structure.
49
+ class StepAction(BaseModel):
50
+ """Wrapper model requesting a single concrete CLA action from the Langchain model."""
51
+ action: SingleCLAction = Field(..., description="The single CLA action to perform for this step.")
52
+
53
+ # Generic Type for the Langchain Model/Runnable
54
+ # Allows flexibility in what the user provides (model, chain, etc.)
55
+ # Bound to BaseLanguageModel as .with_structured_output is expected
56
+ LangchainModelOrRunnable = TypeVar("LangchainModelOrRunnable", bound=BaseLanguageModel)
57
+
58
class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
    """
    An agent that drives an arbitrary Langchain model and asks it, through
    Langchain's structured-output support, for exactly one CLA action per step.

    Each successful step appends the human observation message and a textual
    copy of the chosen action to ``self.history``, which is replayed to the
    model on the next step.
    """

    def __init__(
        self,
        langchain_model: LangchainModelOrRunnable,
        adapter: Optional[Adapter] = None,
        system_prompt: str | None = None,
    ):
        """
        Initialize the LangchainAgent.

        Args:
            langchain_model: The Langchain language model to use. It is called
                             through `.with_structured_output()` and the result
                             through `ainvoke`, so it must support both.
            adapter: An optional HUD adapter, forwarded unchanged to the base
                     ``Agent`` constructor.
            system_prompt: An optional system prompt to guide the Langchain model.
                           If None (or empty), the default prompt from
                           `_get_default_system_prompt` is used.
        """
        # The base class receives the model under the generic name 'client'.
        super().__init__(client=langchain_model, adapter=adapter)
        # Also keep it under a Langchain-specific name for readability below.
        self.langchain_model = langchain_model

        self.system_prompt_str = system_prompt or self._get_default_system_prompt()
        self.history: List[BaseMessage] = []

    def _get_default_system_prompt(self) -> str:
        """Return the built-in system prompt that asks for a single StepAction per step."""
        # TODO: Refine this prompt based on testing.
        # It needs to strongly encourage outputting *only* the StepAction structure.
        sentences = (
            "You are an agent interacting with a computer environment "
            "(either a web browser or an OS desktop).",
            "Your goal is to follow the user's instructions based on the provided "
            "text and screenshot observations.",
            "For each step, you must choose exactly ONE action to perform from the "
            "available CLA action types.",
            "Output your chosen action using the provided 'StepAction' tool/function.",
            "If you believe the task is complete based on the user's prompt and the "
            "observations, use the 'ResponseAction'.",
        )
        # Join with a space so consecutive sentences do not run together.
        return " ".join(sentences)

    async def fetch_response(self, observation: Observation) -> tuple[list[Any], bool]:
        """
        Ask the configured Langchain model for a single structured CLA action.

        Args:
            observation: The observation for this step. Its ``text`` and
                         ``screenshot`` attributes are read; the screenshot is
                         embedded as a base64 PNG data URL.

        Returns:
            A tuple containing:
            - A list holding the single action object chosen by the model, or
              an empty list when the observation was empty, the model call
              raised, or the model did not return a ``StepAction``.
            - A boolean that is True when the chosen action is a
              ``ResponseAction``.
        """
        # 1. Format observation into Langchain message content
        human_content: List[Union[str, dict]] = []
        if observation.text:
            human_content.append(observation.text)
        if observation.screenshot:
            # Assumes the model accepts base64 images in "image_url" form;
            # this may need adjusting for a specific model.
            human_content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{observation.screenshot}"
                },
            })

        if not human_content:
            logger.warning("LangchainAgent received an observation with no text or screenshot.")
            return [], False

        current_human_message = HumanMessage(content=human_content)

        # 2. Prepare message history for the model
        messages_for_llm: List[BaseMessage] = [
            SystemMessage(content=self.system_prompt_str),
            *self.history,
            current_human_message,
        ]

        # 3. Configure structured output
        # We ask for the StepAction wrapper, which contains the actual action.
        # Explicitly use method="function_calling" to handle schemas with default values.
        structured_llm = self.langchain_model.with_structured_output(
            schema=StepAction,
            method="function_calling",
        )

        # 4. Invoke Langchain model asynchronously
        try:
            ai_response_structured = await structured_llm.ainvoke(messages_for_llm)
        except Exception as e:
            # Best effort: a failed model call yields "no action" rather than a crash.
            logger.exception("Langchain model invocation failed: %s", e)
            return [], False

        # 5. Process the structured response
        if not isinstance(ai_response_structured, StepAction):
            logger.warning(
                "Langchain model did not return the expected StepAction structure. "
                "Received type: %s. Value: %r",
                type(ai_response_structured),
                ai_response_structured,
            )
            # Nothing is added to history for an unusable response.
            return [], False

        actual_action = ai_response_structured.action
        action_dump = actual_action.model_dump()
        is_done = isinstance(actual_action, ResponseAction)
        if is_done:
            logger.info(
                "LangchainAgent determined task is done with response: %s...",
                actual_action.text[:100],
            )
        else:
            logger.info("LangchainAgent produced action: %s", type(actual_action).__name__)

        # 6. Update history
        self.history.append(current_human_message)
        # AIMessage content is the repr of the action dict.
        self.history.append(AIMessage(content=repr(action_dump)))
        # TODO: Consider history truncation/summarization if it grows too long

        # Return the single action within a list
        return [actual_action], is_done
hud/env/remote_client.py CHANGED
@@ -74,6 +74,10 @@ class RemoteClient(Client):
74
74
 
75
75
  build_data = response.get("metadata", {})
76
76
 
77
+ if response.get("readme"):
78
+ logger.info("[HUD] %s gym created, see how to use it at %s", gym_id,
79
+ response.get("readme"))
80
+
77
81
  return controller, build_data
78
82
 
79
83
  def __init__(self, env_id: str) -> None:
hud/gym.py CHANGED
@@ -8,12 +8,12 @@ from hud.env.environment import Environment
8
8
  from hud.env.local_docker_client import LocalDockerClient
9
9
  from hud.env.remote_client import RemoteClient
10
10
  from hud.env.remote_docker_client import RemoteDockerClient
11
- from hud.task import Task
12
11
  from hud.types import CustomGym, Gym
13
12
  from hud.utils.common import get_gym_id
14
13
 
15
14
  if TYPE_CHECKING:
16
15
  from hud.job import Job
16
+ from hud.task import Task
17
17
 
18
18
  logger = logging.getLogger("hud.gym")
19
19
 
@@ -54,9 +54,9 @@ async def make(
54
54
 
55
55
  gym = None
56
56
  task = None
57
- if isinstance(env_src, Gym):
57
+ if isinstance(env_src, str | CustomGym):
58
58
  gym = env_src
59
- elif isinstance(env_src, Task):
59
+ else:
60
60
  gym = env_src.gym
61
61
  task = env_src
62
62
 
hud/job.py CHANGED
@@ -1,17 +1,27 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import asyncio
3
4
  import datetime
4
5
  import functools
5
6
  import inspect
6
7
  import logging
7
- from collections.abc import Callable
8
- from typing import Any, TypeVar, cast
8
+ import sys
9
+ from collections.abc import Callable, Coroutine
10
+ from typing import TYPE_CHECKING, Any, TypeVar, cast
9
11
 
10
- from pydantic import BaseModel, TypeAdapter
12
+ from pydantic import BaseModel, PrivateAttr, TypeAdapter
11
13
 
14
+ from hud import gym
12
15
  from hud.server import make_request
13
16
  from hud.settings import settings
17
+ from hud.task import Task
18
+ from hud.taskset import TaskSet
14
19
  from hud.trajectory import Trajectory
20
+ from hud.utils.progress import StepProgressTracker
21
+
22
+ if TYPE_CHECKING:
23
+ from hud.adapters.common import Adapter
24
+ from hud.agent.base import Agent
15
25
 
16
26
  logger = logging.getLogger("hud.job")
17
27
 
@@ -25,7 +35,7 @@ class Job(BaseModel):
25
35
  """
26
36
  A job represents a collection of related trajectories.
27
37
  It holds metadata and provides methods to interact with job data.
28
- Instances should typically be obtained via `create_job` or `load_job`.
38
+ Instances should typically be obtained via `create_job`, `load_job`, or the new `run_job`.
29
39
  """
30
40
 
31
41
  id: str
@@ -34,23 +44,85 @@ class Job(BaseModel):
34
44
  created_at: datetime.datetime
35
45
  status: str
36
46
 
37
- async def load_trajectories(self, *, api_key: str | None = None) -> list[Trajectory]:
47
+ # Internal cache for trajectories
48
+ _trajectories: list[Trajectory] | None = PrivateAttr(default=None)
49
+ # Store execution errors for debugging
50
+ errors: list[dict[str, Any]] = []
51
+
52
+ async def load_trajectories(
53
+ self, *, api_key: str | None = None, force_reload: bool = False
54
+ ) -> list[Trajectory]:
38
55
  """
39
56
  Loads the trajectories associated with this job.
57
+ Uses cached results unless force_reload is True.
58
+
59
+ Args:
60
+ api_key: Optional API key.
61
+ force_reload: If True, fetches trajectories from the API even if cached.
40
62
 
41
63
  Returns:
42
64
  List[Trajectory]: The trajectories in the job
43
65
  """
66
+ if self._trajectories is not None and not force_reload:
67
+ logger.debug("Returning cached trajectories for Job %s", self.id)
68
+ return self._trajectories
69
+
70
+ logger.debug("Fetching trajectories for Job %s from API...", self.id)
44
71
  api_key = api_key or settings.api_key
45
72
 
46
- data = await make_request(
47
- method="GET",
48
- url=f"{settings.base_url}/v2/jobs/{self.id}/trajectories",
49
- api_key=api_key,
50
- )
73
+ try:
74
+ data = await make_request(
75
+ method="GET",
76
+ url=f"{settings.base_url}/v2/jobs/{self.id}/trajectories",
77
+ api_key=api_key,
78
+ )
79
+ self._trajectories = TypeAdapter(list[Trajectory]).validate_python(data)
80
+ logger.debug("Loaded %d trajectories for Job %s", len(self._trajectories), self.id)
81
+ return self._trajectories
82
+ except Exception as e:
83
+ logger.exception("Failed to load trajectories for Job %s: %s", self.id, e)
84
+ self._trajectories = None # Ensure cache is cleared on error
85
+ return [] # Return empty list on error
86
+
87
+ async def get_analytics(self, *, force_reload: bool = False) -> dict[str, Any]:
88
+ """
89
+ Calculates and returns analytics for the job based on its trajectories.
90
+
91
+ Args:
92
+ force_reload: If True, re-fetches trajectories before calculating.
93
+
94
+ Returns:
95
+ Dictionary containing analytics (e.g., task_count, avg_reward).
96
+ """
97
+ trajectories = await self.load_trajectories(force_reload=force_reload)
51
98
 
52
- return TypeAdapter(list[Trajectory]).validate_python(data)
99
+ task_count = len(trajectories)
100
+ if task_count == 0:
101
+ return {"task_count": 0, "avg_reward": None, "success_rate": None} # Or other default
102
+
103
+ total_reward = 0
104
+ successful_tasks = 0
105
+ valid_rewards = 0
53
106
 
107
+ for traj in trajectories:
108
+ # Example: Assume reward is numeric and success is reward >= 1.0
109
+ # Adjust based on actual trajectory data structure and evaluation logic
110
+ if isinstance(traj.reward, int | float):
111
+ total_reward += traj.reward
112
+ valid_rewards += 1
113
+ if traj.reward >= 1.0:
114
+ successful_tasks += 1
115
+ # Add more complex logic here if needed based on traj.evaluation_result or metadata
116
+
117
+ avg_reward = (total_reward / valid_rewards) if valid_rewards > 0 else None
118
+ success_rate = (successful_tasks / task_count) * 100 if task_count > 0 else None
119
+
120
+ return {
121
+ "task_count": task_count,
122
+ "avg_reward": avg_reward,
123
+ "success_rate": success_rate,
124
+ # Add other relevant stats here
125
+ }
54
126
 
55
127
  async def create_job(name: str, gym_id: str | None = None,
56
128
  evalset_id: str | None = None,
@@ -84,7 +156,9 @@ async def create_job(name: str, gym_id: str | None = None,
84
156
  # or at least the necessary fields (id, name, metadata, created_at, status)
85
157
  # If not, we might need to make a subsequent GET request
86
158
  job_data = data # Adjust if the API response structure is different
87
-
159
+
160
+ logger.info("[HUD] View job at https://app.hud.so/jobs/%s.", job_data["id"])
161
+
88
162
  return Job(
89
163
  id=job_data["id"],
90
164
  name=job_data["name"],
@@ -183,3 +257,337 @@ def get_active_job() -> Job | None:
183
257
  frame = frame.f_back
184
258
 
185
259
  return None
260
+
261
+ # --- Moved helper functions from runner.py ---
262
+
263
def _record_job_error(
    job: Job, task_id: str, error_type: str, error: BaseException, **extra: Any
) -> None:
    """Append a timestamped error record for `task_id` to `job.errors`."""
    job.errors.append({
        "task_id": task_id,
        "type": error_type,
        **extra,
        "error": str(error),
        "timestamp": datetime.datetime.now().isoformat(),
    })


def _optional_limit(semaphore: asyncio.Semaphore | None) -> Any:
    """Return `semaphore` for use in `async with`, or a no-op context when it is None."""
    from contextlib import nullcontext

    return semaphore if semaphore is not None else nullcontext()


async def _execute_task(
    agent_cls: type[Agent],
    adapter_cls: type[Adapter] | None,
    agent_kwargs: dict[str, Any] | None,
    adapter_kwargs: dict[str, Any] | None,
    task: Task,
    job_name: str,
    task_id: str,
    max_steps_per_task: int,
    job: Job,
    tracker: StepProgressTracker | None = None,
    # Use semaphores instead of rate limiter
    env_creation_semaphore: asyncio.Semaphore | None = None,
    agent_predict_semaphore: asyncio.Semaphore | None = None,
) -> None:
    """Instantiate an agent, run it against one task, evaluate, and clean up.

    Exceptions raised while setting up, stepping, evaluating or closing are
    logged and recorded in ``job.errors`` instead of propagating.

    Args:
        agent_cls: Agent class; called with ``adapter=...`` plus `agent_kwargs`.
        adapter_cls: Optional adapter class; called with `adapter_kwargs`.
        agent_kwargs: Extra keyword arguments for the agent constructor.
        adapter_kwargs: Keyword arguments for the adapter constructor.
        task: Task passed to ``gym.make``.
        job_name: Accepted for interface compatibility; log lines use
            ``job.name`` instead.
        task_id: Identifier used in logs, error records and tracker calls.
        max_steps_per_task: Upper bound on predict/step iterations.
        job: Job the environment is linked to and whose ``errors`` list is
            appended to.
        tracker: Optional progress tracker notified on start, each step, and
            finish.
        env_creation_semaphore: Optional limit held while ``gym.make`` runs.
        agent_predict_semaphore: Optional limit held while ``agent.predict`` runs.
    """
    if tracker:
        tracker.start_task(task_id)
    env = None
    agent_instance: Agent | None = None
    status = "error"
    error_msg: str | None = "Initialization failed"
    # Initialised up front so the final log line can always reference it.
    evaluation_result = None
    try:
        adapter_instance = adapter_cls(**(adapter_kwargs or {})) if adapter_cls else None
        agent_instance = agent_cls(adapter=adapter_instance, **(agent_kwargs or {}))
        if agent_instance is None:
            raise RuntimeError("Agent could not be instantiated")

        # Environment creation, optionally throttled
        async with _optional_limit(env_creation_semaphore):
            env = await gym.make(task, job=job)

        obs_tuple = await env.reset()
        if obs_tuple is None:
            raise ValueError(f"env.reset() returned None for task {task_id}")
        obs, _ = obs_tuple

        step_error = None
        for step in range(max_steps_per_task):
            try:
                # Agent prediction, optionally throttled
                async with _optional_limit(agent_predict_semaphore):
                    action, done = await agent_instance.predict(obs)

                if tracker:
                    tracker.increment_step(task_id)

                # No action means the agent has nothing more to do; the
                # (None) action is still forwarded to env.step below.
                if action is None:
                    done = True

                step_result = await env.step(action)
                if step_result is None:
                    terminated = True
                else:
                    obs, _, terminated, _ = step_result
                if terminated or done:
                    break

            except Exception as agent_step_err:
                logger.exception("[Job: %s/%s, Task: %s] Step %d Error: %s", job.name, job.id,
                                 task_id, step + 1, agent_step_err)
                step_error = f"Error at step {step + 1}: {agent_step_err}"
                _record_job_error(job, task_id, "step_error", agent_step_err, step=step + 1)
                break
        else:
            # Loop ran to completion without a break.
            logger.warning("[Job: %s/%s, Task: %s] Max steps reached.", job.name, job.id, task_id)

        # --- Evaluate Task ---
        if step_error:
            status = "error"
            error_msg = step_error
        else:
            try:
                evaluation_result = await env.evaluate()
                status = "completed"
                error_msg = None
            except Exception as eval_err:
                logger.exception("[Job: %s/%s, Task: %s] Evaluation Error: %s", job.name,
                                 job.id, task_id, eval_err)
                status = "error"
                error_msg = f"Evaluation failed: {eval_err}"
                _record_job_error(job, task_id, "evaluation_error", eval_err)

    except Exception as e:
        logger.exception("[Job: %s/%s, Task: %s] Setup/Run Error: %s", job.name, job.id, task_id, e)
        status = "error"
        error_msg = str(e)
        _record_job_error(job, task_id, "setup_error", e)

    finally:
        if tracker:
            tracker.finish_task(task_id)
        if env:
            try:
                await env.close()
            except Exception as close_err:
                logger.exception("[Job: %s/%s, Task: %s] Close Error: %s", job.name, job.id,
                                 task_id, close_err)
                _record_job_error(job, task_id, "env_close_error", close_err)

    log_suffix = f" Error: {error_msg}" if status == "error" else f" Eval: {evaluation_result}"
    logger.info("[Job: %s/%s, Task: %s] Finished local execution. Status: %s.%s", job.name,
                job.id, task_id, status, log_suffix)
402
+
403
async def _progress_monitor(tracker: StepProgressTracker, interval: float = 1.0) -> None:
    """Periodically redraw the tracker's progress line on stderr until it finishes.

    Args:
        tracker: Object providing ``is_finished()`` and ``display()``.
        interval: Seconds to sleep between redraws.

    Raises:
        asyncio.CancelledError: Re-raised after the cancellation notice is
            written, so the task is reported as cancelled to whoever awaits it.
    """
    try:
        while not tracker.is_finished():
            # "\r" returns to the start of the line so each redraw overwrites the last.
            sys.stderr.write(f"\r{tracker.display()}")
            sys.stderr.flush()
            await asyncio.sleep(interval)
        # Final redraw, terminated with a newline.
        sys.stderr.write(f"\r{tracker.display()}\n")
        sys.stderr.flush()
        logger.debug("Progress monitor finished.")
    except asyncio.CancelledError:
        sys.stderr.write("\nProgress monitor cancelled.\n")
        sys.stderr.flush()
        logger.debug("Progress monitor cancelled.")
        # Cancellation must propagate; swallowing it hides it from the awaiter.
        raise
    except Exception as e:
        # Progress display is best effort: report the failure and return.
        sys.stderr.write(f"\nProgress monitor error: {e}\n")
        sys.stderr.flush()
        logger.exception("Progress monitor error: %s", e)
421
+
422
+
423
+ # --- New run_job function ---
424
+
425
async def run_job(
    agent_cls: type[Agent],
    task_or_taskset: Task | TaskSet,
    job_name: str,
    adapter_cls: type[Adapter] | None = None,
    agent_kwargs: dict[str, Any] | None = None,
    adapter_kwargs: dict[str, Any] | None = None,
    max_steps_per_task: int = 20,
    run_parallel: bool = True,
    job_metadata: dict[str, Any] | None = None,
    show_progress: bool = True,
    # Concurrency control with semaphores
    max_concurrent_env_creations: int | None = 30,  # Limits env.make calls
    max_concurrent_agent_predictions: int | None = 30,  # Limits agent.predict calls
    max_concurrent_tasks: int | None = 30,  # Limits overall task concurrency
) -> Job:
    """
    Creates a Job and executes the given task(s) locally, linked to that Job.
    A fresh agent (and adapter, if given) is instantiated per task.

    Concurrency is bounded in three independent ways, each by a semaphore:
        1. concurrent environment creations
        2. concurrent agent predictions
        3. concurrently running tasks (only when running in parallel)

    The input is validated before the Job is created, so an invalid
    `task_or_taskset` does not create a Job.

    Args:
        agent_cls: Agent class to instantiate.
        task_or_taskset: Task or TaskSet to run.
        job_name: Name for the Job.
        adapter_cls: Optional Adapter class.
        agent_kwargs: Optional kwargs for agent constructor.
        adapter_kwargs: Optional kwargs for adapter constructor.
        max_steps_per_task: Step limit per task.
        run_parallel: Run TaskSet tasks concurrently if True (limited by
            max_concurrent_tasks). A single Task always runs sequentially.
        job_metadata: Metadata for the created Job.
        show_progress: Display the step-based progress tracker.
        max_concurrent_env_creations: Max concurrent environment creation calls;
            None or a non-positive value disables the limit.
        max_concurrent_agent_predictions: Max concurrent agent prediction calls;
            None or a non-positive value disables the limit.
        max_concurrent_tasks: Max number of tasks to run at the same time;
            None or a non-positive value disables the limit.

    Returns:
        The created Job object; per-task errors are appended to job.errors
        by the task executor.

    Raises:
        TypeError: If `task_or_taskset` is neither a Task nor a TaskSet.
        Exception: Whatever `create_job` raises is logged and re-raised.
    """
    # --- Task Setup (validated first so bad input never creates a job) ---
    tasks_to_run: list[Task]
    if isinstance(task_or_taskset, TaskSet):
        tasks_to_run = task_or_taskset.tasks or []
    elif isinstance(task_or_taskset, Task):
        tasks_to_run = [task_or_taskset]
        run_parallel = False
    else:
        raise TypeError("task_or_taskset must be either a Task or a TaskSet")

    # --- Create Job ---
    try:
        logger.info("Creating job with name: '%s'", job_name)
        created_job = await create_job(name=job_name, metadata=job_metadata)
        logger.info("Created job with ID: %s", created_job.id)
    except Exception as e:
        logger.exception("Failed to create job '%s': %s", job_name, e)
        raise

    if not tasks_to_run:
        logger.warning("Job '%s' (%s): No tasks found to run.", created_job.name, created_job.id)
        return created_job

    # Tasks without an id get a positional placeholder.
    task_ids = [(str(task.id) if task.id else f"task_{i}") for i, task in enumerate(tasks_to_run)]
    num_tasks = len(tasks_to_run)

    # --- Create semaphores for concurrency control ---
    env_creation_sema = None
    if max_concurrent_env_creations and max_concurrent_env_creations > 0:
        env_creation_sema = asyncio.Semaphore(max_concurrent_env_creations)
        logger.info("Limiting concurrent environment creations to %d.",
                    max_concurrent_env_creations)

    agent_predict_sema = None
    if max_concurrent_agent_predictions and max_concurrent_agent_predictions > 0:
        agent_predict_sema = asyncio.Semaphore(max_concurrent_agent_predictions)
        logger.info("Limiting concurrent agent predictions to %d.",
                    max_concurrent_agent_predictions)

    task_execution_sema = None
    effective_concurrency = num_tasks  # Default to running all if parallel
    if run_parallel and max_concurrent_tasks and max_concurrent_tasks > 0:
        effective_concurrency = min(num_tasks, max_concurrent_tasks)
        task_execution_sema = asyncio.Semaphore(effective_concurrency)
        logger.info("Limiting concurrent task executions to %d.", effective_concurrency)
    elif not run_parallel:
        effective_concurrency = 1  # Sequential means concurrency of 1

    # --- Instantiate Tracker & Start Monitor ---
    tracker = None
    monitor_task = None
    if show_progress:
        tracker = StepProgressTracker(total_tasks=num_tasks, max_steps_per_task=max_steps_per_task)
        monitor_task = asyncio.create_task(_progress_monitor(tracker))

    job_desc_suffix = f" (Job ID: {created_job.id})"

    def launch(task: Task, task_id: str) -> Coroutine:
        """Build the executor coroutine for one task with the shared settings."""
        return _execute_task(
            agent_cls=agent_cls, adapter_cls=adapter_cls, agent_kwargs=agent_kwargs,
            adapter_kwargs=adapter_kwargs, task=task, job_name=created_job.name,
            task_id=task_id,
            max_steps_per_task=max_steps_per_task, job=created_job, tracker=tracker,
            env_creation_semaphore=env_creation_sema,
            agent_predict_semaphore=agent_predict_sema,
        )

    async def launch_limited(task: Task, task_id: str) -> None:
        """Run one task while holding a slot of the task-level semaphore."""
        async with task_execution_sema:
            await launch(task, task_id)

    # --- Execute Tasks ---
    try:
        # run_parallel is forced to False for a single Task above, so being
        # True here implies a TaskSet.
        if run_parallel:
            logger.info("Job '%s'%s: Running %d tasks with concurrency %d.", created_job.name,
                        job_desc_suffix, num_tasks, effective_concurrency)
            runner = launch_limited if task_execution_sema is not None else launch
            await asyncio.gather(*(
                runner(task, task_id)
                for task, task_id in zip(tasks_to_run, task_ids, strict=True)
            ))
        else:
            # SEQUENTIAL (or single task)
            logger.info("Job '%s'%s: Running %d tasks sequentially.", created_job.name,
                        job_desc_suffix, num_tasks)
            for task, task_id in zip(tasks_to_run, task_ids, strict=True):
                await launch(task, task_id)

    finally:
        # Ensure monitor task is stopped and awaited cleanly
        if monitor_task is not None and not monitor_task.done():
            monitor_task.cancel()
            try:
                await monitor_task
            except asyncio.CancelledError:
                pass
            except Exception as e:
                logger.error("Error awaiting progress monitor task: %s", e)

    logger.info("Job '%s'%s finished local execution phase for %d tasks.", created_job.name,
                job_desc_suffix, num_tasks)
    return created_job
hud/task.py CHANGED
@@ -112,7 +112,7 @@ class Task(BaseModel):
112
112
  if sandbox:
113
113
  if isinstance(sandbox, str):
114
114
  if sandbox == "docker":
115
- dockerfile = UBUNTU_DOCKERFILE
115
+ dockerfile = UBUNTU_DOCKERFILE
116
116
  use_qa_gym = False
117
117
  elif isinstance(sandbox, tuple) and len(sandbox) == 2:
118
118
  sandbox_type, sandbox_config = sandbox
@@ -122,7 +122,7 @@ class Task(BaseModel):
122
122
 
123
123
  if use_qa_gym:
124
124
  task_gym = "qa"
125
- task_setup = None
125
+ task_setup = None
126
126
  else:
127
127
  task_gym = CustomGym(
128
128
  dockerfile=dockerfile or UBUNTU_DOCKERFILE,
@@ -138,7 +138,7 @@ class Task(BaseModel):
138
138
  setup=task_setup,
139
139
  metadata=sample.metadata,
140
140
  choices=sample.choices,
141
- evaluate=evaluate_config,
141
+ evaluate=evaluate_config,
142
142
  gym=task_gym,
143
143
  # files=sample.files, # TODO: Decide how/if to handle files
144
144
  )
hud/types.py CHANGED
@@ -44,9 +44,6 @@ class CustomGym(BaseModel):
44
44
  # Read the Dockerfile content
45
45
  self.dockerfile = dockerfile_path.read_text()
46
46
 
47
- # Strings are identifiers for gyms on the HUD server
48
- Gym = CustomGym | str
49
-
50
47
  class EnvironmentStatus(str, enum.Enum):
51
48
  """
52
49
  Status of the environment.
@@ -63,3 +60,8 @@ class EnvironmentStatus(str, enum.Enum):
63
60
  COMPLETED = "completed"
64
61
  ERROR = "error"
65
62
 
63
+ # Available HUD gyms
64
+ ServerGym = Literal["qa", "hud-browser", "hud-ubuntu", "OSWorld-Ubuntu"]
65
+
66
+ # Gyms can be either custom or server-side
67
+ Gym = CustomGym | ServerGym
hud/utils/common.py CHANGED
@@ -38,7 +38,10 @@ class HudStyleConfig(BaseModel):
38
38
  ShorthandConfig = tuple[str | dict[str, Any] | list[str] | list[dict[str, Any]], ...]
39
39
 
40
40
  # Type alias for multiple config formats
41
- HudStyleConfigs = ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | dict[str, Any] | str
41
+ HudStyleConfigs = (
42
+ ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | list[ShorthandConfig]
43
+ | dict[str, Any] | str
44
+ )
42
45
 
43
46
  class ExecuteResult(TypedDict):
44
47
  """
hud/utils/config.py CHANGED
@@ -65,7 +65,7 @@ def expand_config(config: HudStyleConfigs) -> list[HudStyleConfig]:
65
65
 
66
66
  # If it's a list of HudStyleConfigs, return as is
67
67
  if isinstance(config, list) and all(isinstance(item, HudStyleConfig) for item in config):
68
- return config
68
+ return config # type: ignore
69
69
 
70
70
  # Handle dictionary configuration
71
71
  if isinstance(config, dict):
hud/utils/progress.py ADDED
@@ -0,0 +1,136 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from collections import defaultdict
5
+
6
+
7
+ class StepProgressTracker:
8
+ """
9
+ Tracks progress across potentially parallel async tasks based on steps completed.
10
+ Provides estimates assuming tasks run up to max_steps_per_task.
11
+ """
12
+ def __init__(self, total_tasks: int, max_steps_per_task: int) -> None:
13
+ if total_tasks <= 0:
14
+ raise ValueError("total_tasks must be positive")
15
+ if max_steps_per_task <= 0:
16
+ raise ValueError("max_steps_per_task must be positive")
17
+
18
+ self.total_tasks = total_tasks
19
+ self.max_steps_per_task = max_steps_per_task
20
+ self.total_potential_steps = total_tasks * max_steps_per_task
21
+
22
+ # Use asyncio.Lock for potentially concurrent updates/reads if needed,
23
+ # but start without for simplicity in single-threaded asyncio.
24
+ # self._lock = asyncio.Lock()
25
+ self._task_steps: dict[str, int] = defaultdict(int)
26
+ self._finished_tasks: dict[str, bool] = defaultdict(bool)
27
+ self._tasks_started = 0
28
+ self._tasks_finished = 0
29
+
30
+ self.start_time: float | None = None
31
+ self.current_total_steps = 0
32
+
33
+ def start_task(self, task_id: str) -> None:
34
+ # async with self._lock: # If using lock
35
+ if self.start_time is None:
36
+ self.start_time = time.monotonic()
37
+ self._task_steps[task_id] = 0
38
+ self._finished_tasks[task_id] = False
39
+ self._tasks_started += 1
40
+
41
+ def increment_step(self, task_id: str) -> None:
42
+ # async with self._lock:
43
+ if (not self._finished_tasks[task_id] and
44
+ self._task_steps[task_id] < self.max_steps_per_task):
45
+ self._task_steps[task_id] += 1
46
+ # Update overall progress immediately
47
+ self._update_total_steps()
48
+
49
+ def finish_task(self, task_id: str) -> None:
50
+ # async with self._lock:
51
+ if not self._finished_tasks[task_id]:
52
+ # For calculation, consider a finished task as having completed max steps
53
+ self._task_steps[task_id] = self.max_steps_per_task
54
+ self._finished_tasks[task_id] = True
55
+ self._tasks_finished += 1
56
+ # Update overall progress
57
+ self._update_total_steps()
58
+
59
+ def _update_total_steps(self) -> None:
60
+ # This could be expensive if called extremely frequently.
61
+ # Called after increment or finish.
62
+ # async with self._lock:
63
+ self.current_total_steps = sum(self._task_steps.values())
64
+
65
+ def get_progress(self) -> tuple[int, int, float]:
66
+ """Returns (current_steps, total_potential_steps, percentage)."""
67
+ # async with self._lock:
68
+ # Recalculate here for safety, though _update_total_steps should keep it current
69
+ # current_steps = sum(self._task_steps.values())
70
+ current_steps = self.current_total_steps
71
+
72
+ percentage = 0.0
73
+ if self.total_potential_steps > 0:
74
+ percentage = (current_steps / self.total_potential_steps) * 100
75
+ return current_steps, self.total_potential_steps, percentage
76
+
77
+ def get_stats(self) -> tuple[float, float | None]:
78
+ """Returns (rate_steps_per_minute, eta_seconds_upper_bound)."""
79
+ # async with self._lock:
80
+ if self.start_time is None or self._tasks_started == 0:
81
+ return 0.0, None # No rate or ETA yet
82
+
83
+ elapsed_time = time.monotonic() - self.start_time
84
+ current_steps = self.current_total_steps
85
+
86
+ rate_sec = 0.0
87
+ if elapsed_time > 0:
88
+ rate_sec = current_steps / elapsed_time
89
+
90
+ rate_min = rate_sec * 60 # Convert rate to steps per minute
91
+
92
+ eta = None
93
+ # ETA calculation still uses rate_sec (steps/second) for time estimation in seconds
94
+ if rate_sec > 0:
95
+ remaining_steps = self.total_potential_steps - current_steps
96
+ eta = remaining_steps / rate_sec if remaining_steps > 0 else 0.0
97
+
98
+ return rate_min, eta # Return rate in steps/min
99
+
100
+ def is_finished(self) -> bool:
101
+ # async with self._lock:
102
+ return self._tasks_finished >= self.total_tasks
103
+
104
+ def display(self, bar_length: int = 40) -> str:
105
+ """Generates a progress string similar to tqdm."""
106
+ current_steps, total_steps, percentage = self.get_progress()
107
+ rate_min, eta = self.get_stats() # Rate is now per minute
108
+
109
+ # Ensure valid values for display
110
+ current_steps = min(current_steps, total_steps)
111
+ percentage = max(0.0, min(100.0, percentage))
112
+
113
+ filled_length = int(bar_length * current_steps // total_steps) if total_steps else 0
114
+ bar = "█" * filled_length + "-" * (bar_length - filled_length)
115
+
116
+ # Format time
117
+ elapsed_str = "0:00"
118
+ eta_str = "??:??"
119
+ if self.start_time:
120
+ elapsed_seconds = int(time.monotonic() - self.start_time)
121
+ elapsed_str = f"{elapsed_seconds // 60}:{elapsed_seconds % 60:02d}"
122
+ if eta is not None:
123
+ eta_seconds = int(eta)
124
+ eta_str = f"{eta_seconds // 60}:{eta_seconds % 60:02d}"
125
+ elif self.is_finished():
126
+ eta_str = "0:00"
127
+
128
+ # Update rate string format
129
+ rate_str = f"{rate_min:.1f} steps/min" if rate_min > 0 else "?? steps/min"
130
+
131
+ # Format steps - use K/M for large numbers if desired, keep simple for now
132
+ steps_str = f"{current_steps}/{total_steps}"
133
+
134
+ # tasks_str = f" {self._tasks_finished}/{self.total_tasks} tasks" # Optional tasks counter
135
+
136
+ return f"{percentage:3.0f}%|{bar}| {steps_str} [{elapsed_str}<{eta_str}, {rate_str}]"
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: SDK for the HUD evaluation platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
7
7
  Project-URL: Documentation, https://hud.so
8
- Author-email: Human Union Data SDK <founders@hud.so>
8
+ Author-email: HUD SDK <founders@hud.so>
9
9
  License: MIT License
10
10
 
11
11
  Copyright (c) 2025 Human Union Data, Inc
@@ -37,8 +37,14 @@ Classifier: Programming Language :: Python :: 3.12
37
37
  Classifier: Programming Language :: Python :: 3.13
38
38
  Requires-Python: <3.14,>=3.10
39
39
  Requires-Dist: aiodocker>=0.24.0
40
+ Requires-Dist: anthropic
40
41
  Requires-Dist: httpx<1,>=0.23.0
41
42
  Requires-Dist: inspect-ai>=0.3.80
43
+ Requires-Dist: ipykernel
44
+ Requires-Dist: langchain
45
+ Requires-Dist: langchain-openai
46
+ Requires-Dist: numpy
47
+ Requires-Dist: openai
42
48
  Requires-Dist: pillow>=11.1.0
43
49
  Requires-Dist: pydantic-settings<3,>=2
44
50
  Requires-Dist: pydantic<3,>=2
@@ -117,10 +123,9 @@ async def main():
117
123
  obs, _ = await env.reset() # Gets first observation
118
124
  for i in range(5):
119
125
  actions, done = await agent.predict(obs)
120
- if done:
121
- break
122
-
126
+
123
127
  obs, reward, terminated, info = await env.step(actions)
128
+ if done or terminated: break
124
129
 
125
130
  # Evaluate and close
126
131
  result = await env.evaluate()
@@ -132,22 +137,37 @@ if __name__ == "__main__":
132
137
 
133
138
  ```
134
139
 
140
+ Alternatively, run a full evaluation set with the `run_job` function:
141
+
142
+ ```python
143
+ from hud import load_taskset, run_job
+ from hud.agent import ClaudeAgent
144
+
145
+ # load
146
+ taskset = load_taskset("GAIA")
147
+
148
+ # evaluate
149
+ job = await run_job(ClaudeAgent, taskset, "test-gaia-job")
150
+
151
+ # get results OR view them in app.hud.so
152
+ print(await job.get_analytics())
153
+ ```
154
+
135
155
  ## Documentation Sections
136
156
 
137
157
  Explore the core concepts and features of the SDK:
138
158
 
139
- * **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
140
- * **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
141
- * **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
142
- * **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
143
- * **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
144
- * **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
159
+ * **[Tasks and TaskSets](https://documentation.hud.so/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
160
+ * **[Environments](https://documentation.hud.so/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
161
+ * **[Agents](https://documentation.hud.so/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
162
+ * **[Adapters](https://documentation.hud.so/concepts/adapter)**: See how actions and observations are translated between agents and environments.
163
+ * **[Jobs](https://documentation.hud.so/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
164
+ * **[Trajectories](https://documentation.hud.so/concepts/trajectory)**: Understand the recorded data from each agent run.
145
165
  * **Advanced Topics**:
146
- * **[CLA Action Details](/advanced/cla-details)**: Explore the standardized action format.
147
- * **[Custom Environments](/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
148
- * **[Advanced Environment Control](/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
166
+ * **[CLA Action Details](https://documentation.hud.so/advanced/cla-details)**: Explore the standardized action format.
167
+ * **[Custom Environments](https://documentation.hud.so/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
168
+ * **[Advanced Environment Control](https://documentation.hud.so/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
149
169
 
150
- * **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
170
+ * **[Full API Reference](https://documentation.hud.so/api-reference/gym)**: Detailed specifications for all modules and classes.
151
171
 
152
172
  ## [Examples](examples/)
153
173
 
@@ -160,7 +180,7 @@ We recommend you first take a look at the example notebooks showing how to use t
160
180
 
161
181
  ## Documentation
162
182
 
163
- For comprehensive guides, examples, and API reference, visit [our docs](https://docs.hud.so/introduction)
183
+ For comprehensive guides, examples, and API reference, visit [our docs](https://documentation.hud.so/introduction)
164
184
 
165
185
  ## License
166
186
 
@@ -1,29 +1,30 @@
1
- hud/__init__.py,sha256=HFL1iwPhLZd7z--2QADzipur68XlekwGrOzU2vWL-Vw,464
2
- hud/gym.py,sha256=cKjIuJS7A0vJx4K7fctpUjIEv8TkW5x6aB_PRrODrDY,3651
3
- hud/job.py,sha256=E4RN1CkppRQVy46RWCUDjNIyhMa7lNlFfCgpky2vKFk,5463
1
+ hud/__init__.py,sha256=XJXuALIb-pRnnVdfEkjpuiLtS77WD3Idv5VOLECY3eo,488
2
+ hud/gym.py,sha256=ErNwJgCJVhWZHzMILfzVXX0Dawh5Cy0nIQWWh7fsKW4,3641
3
+ hud/job.py,sha256=IvW2sBFoQpExXVi2FL3cEwnrxVIGp8RBfVj2s8edn20,22387
4
4
  hud/settings.py,sha256=rv8TiZx4wmBzIoEEkOzoywC0nt8UZXlHxIa_LW4tWAg,1346
5
- hud/task.py,sha256=aNbHMlO7r1cm5DcO0QLU1SZ7EawOFw9W6DZwTNy72-4,5383
5
+ hud/task.py,sha256=kuP69hIxV0ZsHRsZ1XEq6lzYnUSD3b6ywWzloCGW5DU,5380
6
6
  hud/taskset.py,sha256=xDPBXeDm4AlSOwl-MM98lN0x6PmGV8t9jv7sNyS_u0c,2426
7
7
  hud/trajectory.py,sha256=PA-sE2iyt2BctO2Dex-2ZaRmS95AkEXTicZjHCVCYqE,3749
8
- hud/types.py,sha256=fJZnzK3j3mq7G0gO5TbqRaN92qT4xAb4jUNOXIX8ZZ0,2395
9
- hud/adapters/__init__.py,sha256=0RNQgrzBCkhNBq1Q7JRESN1WfUVLs_99fR5g1re3APs,207
8
+ hud/types.py,sha256=D_OGPutR55PlWrUDqehYLlR-FqQp9GyKlxJhNmCRyFE,2485
9
+ hud/adapters/__init__.py,sha256=zz24KdC_e9TJPgWo6y57_8SzevEE5ak4Cm6tXzMxwRk,266
10
10
  hud/adapters/claude/__init__.py,sha256=i7QEF-29FLb9qxp1eYtXs-adIk_tG54tL-9g6d3xodk,100
11
- hud/adapters/claude/adapter.py,sha256=x0qQglWsg7n8DJ_NacsymlUQBnkpqNVguUlkQRpYX-A,5955
11
+ hud/adapters/claude/adapter.py,sha256=viZDCNjM6aCCfpxt3PIxfVOz3rrlOgZli5WyHUxEGjc,6079
12
12
  hud/adapters/common/__init__.py,sha256=BjdZWJVs_AKtpFrt-tNsdQRjnz7D97DFEQirJ-r0mp8,118
13
13
  hud/adapters/common/adapter.py,sha256=ls-gXtg1N_SQc211rkDb3LL511HNZv6etm1nx2ZtrkQ,5808
14
- hud/adapters/common/types.py,sha256=APxGEmoePwjF7OYXAKqBTVT73PJTFV0eBmbURbaT5xk,5091
14
+ hud/adapters/common/types.py,sha256=9RWLZp6sViu9uPSU5K8-TRaQkdirunxZfDsPIxAR_TM,4995
15
15
  hud/adapters/operator/__init__.py,sha256=31vTRs268_TOLd-TeQRKau5bDYy78wxCNpJFhD5_l8U,104
16
- hud/adapters/operator/adapter.py,sha256=svHgjCdUeMyfgfGzRO3ItGWTKGkm3tmldO2zfjX_sGI,3301
17
- hud/agent/__init__.py,sha256=cI3bqfmG2_Lwzn2RjrxV0X9qIxCRDiffwd1UaWToct4,238
16
+ hud/adapters/operator/adapter.py,sha256=NNbNYPqSquIh4KHCk9aN7dARe7yPUx0J2kDIk-N015s,3309
17
+ hud/agent/__init__.py,sha256=qdCWY6wthkTpyq7SWT1JYAYu1eXk4LfdSAcAfKt0Ohs,294
18
18
  hud/agent/base.py,sha256=RThJ_h4A3oU23zyvvKtxY2a_YM03Vd1XYDXdY3bAf8g,3881
19
19
  hud/agent/claude.py,sha256=tbDKAzGCLJPnUnHc8eV-zZmj3ZG6QQx0ukWKoO4Ekec,7445
20
+ hud/agent/langchain.py,sha256=9ow74ENcJmZ_muzoMdG2tz5VhvAHm2zKiemphHZm-Pg,8683
20
21
  hud/agent/operator.py,sha256=44t19TzcCrS1N3-rnD25ZLXx5s4Io8On27LomALuugs,8185
21
22
  hud/env/__init__.py,sha256=BHFY_N0kEI142pjWtMyqUb3BGnoiekY8evRCIbSbO2w,271
22
23
  hud/env/client.py,sha256=SPR6ct6NFxmIrgIi3K8tEC-vnqOmCbCBtuT81PaVjuY,869
23
24
  hud/env/docker_client.py,sha256=56_u3Ri4NulGcBumAg-7-KilmFmBKthOwEIM5bOLOZc,10418
24
25
  hud/env/environment.py,sha256=Xyq4KQO9aWYPwZ0uESAetB5EEZgmlEnZVc7sA0DLz2c,13706
25
26
  hud/env/local_docker_client.py,sha256=TCD9z1qjafxjwAWLatAL8d587_ioMDHjs8T5cBgusr8,7789
26
- hud/env/remote_client.py,sha256=iJiwueuf98xOx0_Y2ltu_63BwKIKNvohhim73Goq74E,5804
27
+ hud/env/remote_client.py,sha256=XDKmr5ImLBMZn-ToPrXnc4iBNRwDwzPtQIXEcgShbhE,5977
27
28
  hud/env/remote_docker_client.py,sha256=FwaO7NyygDt9oe3pDD7PwUS21pxzc465mwcXk-Cx-60,6838
28
29
  hud/evaluators/__init__.py,sha256=XophB666xPnurhQ_ygfW44h0Jh0BQGCgUzCXEOG2Q1g,158
29
30
  hud/evaluators/base.py,sha256=CNbrvFWQfl1YuBxJKzuG4_TBAdAf0TOQA3hl7eGsbaA,782
@@ -34,11 +35,12 @@ hud/evaluators/remote.py,sha256=NVUJJvrpGQj2eL-aFxzTEnAWW7iuSI9eDWtar54dc6E,2174
34
35
  hud/server/__init__.py,sha256=cxDKTwMdGzhj7bYajtejN8XCt7K8Xq3eKB2No0qBpoY,169
35
36
  hud/server/requests.py,sha256=s8LZZYWT1wl7lPu2vwRaYPZs9_gjKwSg3LZLvS5-s6E,9085
36
37
  hud/utils/__init__.py,sha256=LnoI2tQUnd-mQ4eg-gpJJgmHBBIhggJ6c9ap7MBgrfs,260
37
- hud/utils/common.py,sha256=XJZ-hKJkeaNmelG2QD5ybi9FpZQS1ErA40fAYzUSHVE,2742
38
- hud/utils/config.py,sha256=ePi3GDo8mDUnOZ5G5HyMprqGRvxrxCMfixGNuTOA8rQ,3266
38
+ hud/utils/common.py,sha256=xJWBF2KTAQKYMGFq5hJWcwpcHAEYY3so4ZqvZYf1BjU,2778
39
+ hud/utils/config.py,sha256=Evu2nUCYaujpWXXwLprsgr_KFUkWuSdkibmLRJ_iq64,3281
40
+ hud/utils/progress.py,sha256=gP7_NXG0m_bhNaYPwrwUOeNumwjx4ewjXP7v-_0Lsj0,5684
39
41
  hud/utils/telemetry.py,sha256=md7AuKxtDqsONMeeTOHen1XpmNds8CbXROX_PnkDxFc,1993
40
42
  hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
- hud_python-0.2.1.dist-info/METADATA,sha256=f2lyqGmu9L7_zgCOqrhZ6ZX1JUU6Z0e92bRTfmojSqQ,7219
42
- hud_python-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
43
- hud_python-0.2.1.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
44
- hud_python-0.2.1.dist-info/RECORD,,
43
+ hud_python-0.2.2.dist-info/METADATA,sha256=I26pZPqv8O5r36BNehTmJuYQjtbRu-C3bCjt37Iwync,7963
44
+ hud_python-0.2.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
45
+ hud_python-0.2.2.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
46
+ hud_python-0.2.2.dist-info/RECORD,,