hud-python 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (46) hide show
  1. hud/__init__.py +16 -12
  2. hud/adapters/__init__.py +4 -2
  3. hud/adapters/claude/adapter.py +0 -1
  4. hud/adapters/common/adapter.py +11 -10
  5. hud/adapters/common/types.py +27 -13
  6. hud/adapters/operator/__init__.py +5 -0
  7. hud/adapters/operator/adapter.py +93 -0
  8. hud/agent/__init__.py +7 -0
  9. hud/agent/base.py +109 -0
  10. hud/agent/claude.py +187 -0
  11. hud/agent/operator.py +190 -0
  12. hud/env/__init__.py +11 -0
  13. hud/env/client.py +35 -0
  14. hud/env/docker_client.py +306 -0
  15. hud/env/environment.py +181 -0
  16. hud/env/local_docker_client.py +249 -0
  17. hud/env/remote_client.py +185 -0
  18. hud/env/remote_docker_client.py +221 -0
  19. hud/evaluators/__init__.py +10 -0
  20. hud/evaluators/base.py +31 -0
  21. hud/evaluators/inspect.py +29 -0
  22. hud/evaluators/judge.py +213 -0
  23. hud/evaluators/match.py +163 -0
  24. hud/evaluators/remote.py +78 -0
  25. hud/gym.py +101 -15
  26. hud/job.py +185 -0
  27. hud/server/__init__.py +2 -2
  28. hud/server/requests.py +87 -0
  29. hud/settings.py +13 -2
  30. hud/task.py +133 -0
  31. hud/taskset.py +95 -0
  32. hud/trajectory.py +90 -0
  33. hud/types.py +65 -0
  34. hud/utils/__init__.py +4 -2
  35. hud/utils/common.py +69 -0
  36. hud/utils/config.py +182 -4
  37. hud/utils/telemetry.py +67 -0
  38. hud_python-0.2.0.dist-info/METADATA +188 -0
  39. hud_python-0.2.0.dist-info/RECORD +44 -0
  40. {hud_python-0.1.5.dist-info → hud_python-0.2.0.dist-info}/licenses/LICENSE +1 -1
  41. hud/client.py +0 -200
  42. hud/environment.py +0 -318
  43. hud/run.py +0 -208
  44. hud_python-0.1.5.dist-info/METADATA +0 -125
  45. hud_python-0.1.5.dist-info/RECORD +0 -21
  46. {hud_python-0.1.5.dist-info → hud_python-0.2.0.dist-info}/WHEEL +0 -0
hud/taskset.py ADDED
@@ -0,0 +1,95 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from pydantic import BaseModel
6
+
7
+ from hud.server import make_request
8
+ from hud.settings import settings
9
+ from hud.task import Task
10
+
11
+ if TYPE_CHECKING:
12
+ from inspect_ai.dataset import Dataset
13
+
14
+
15
class TaskSet(BaseModel):
    """
    An ordered group of tasks that are benchmarked together.

    Supports ``len(taskset)`` and ``taskset[i]`` by delegating to ``tasks``.

    Attributes:
        id: Identifier of the taskset, or None when unset
        description: Description of the taskset, or None when unset
        tasks: The Task objects that make up the taskset
    """
    id: str | None = None
    description: str | None = None
    tasks: list[Task] = []

    def __len__(self) -> int:
        """
        Report how many tasks the taskset holds.

        Returns:
            int: Length of the ``tasks`` list
        """
        task_count = len(self.tasks)
        return task_count

    def __getitem__(self, index: int) -> Task:
        """
        Look up a task by position, enabling ``taskset[index]``.

        Args:
            index: Position of the wanted task within ``tasks``

        Returns:
            Task: The task stored at that position

        Raises:
            IndexError: If the position is outside the list
        """
        selected = self.tasks[index]
        return selected
51
+
52
+
53
async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
    """
    Fetch a taskset's tasks over HTTP and wrap them in a TaskSet.

    Args:
        taskset_id: The ID of the taskset to load
        api_key: API key for the request; ``settings.api_key`` is used when None

    Returns:
        TaskSet: Validated taskset carrying ``taskset_id`` and the tasks found
            under the response's "evalset" key
    """
    effective_key = settings.api_key if api_key is None else api_key
    endpoint = f"{settings.base_url}/v2/tasksets/{taskset_id}/tasks"

    response = await make_request(method="GET", url=endpoint, api_key=effective_key)

    payload = {"id": taskset_id, "tasks": response["evalset"]}
    return TaskSet.model_validate(payload)
78
+
79
def load_from_inspect(dataset: Dataset) -> TaskSet:
    """
    Build a TaskSet out of an inspect-ai dataset.

    Each sample yielded by the dataset is converted with
    ``Task.from_inspect_sample``; the dataset's ``name`` becomes the
    taskset description and the taskset id is left unset.

    Args:
        dataset: An inspect-ai dataset

    Returns:
        TaskSet: A new TaskSet instance
    """
    converted = []
    for sample in dataset:
        converted.append(Task.from_inspect_sample(sample))

    return TaskSet(id=None, tasks=converted, description=dataset.name)
hud/trajectory.py ADDED
@@ -0,0 +1,90 @@
1
+ # ruff: noqa: T201
2
+ from __future__ import annotations
3
+
4
+ import datetime
5
+
6
+ from IPython.display import HTML, Markdown, display
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
class TrajectoryStep(BaseModel):
    """One step of a trajectory: an observation, the actions taken, and timestamps."""

    # Link to the observation image, when the step has one
    observation_url: str | None = None
    # Textual observation, when the step has one
    observation_text: str | None = None
    # Actions recorded for this step (the only required field)
    actions: list[dict]
    # Step boundaries as strings; Trajectory.display parses them as ISO timestamps
    start_timestamp: str | None = None
    end_timestamp: str | None = None
18
+
19
+
20
class Trajectory(BaseModel):
    """A task run's outcome fields together with its ordered list of steps."""

    id: str
    reward: float | None = None
    logs: str | None = None
    error: str | None = None
    trajectory: list[TrajectoryStep] = Field(default_factory=list)

    def display(self) -> None:
        """
        Render every step through IPython's display helpers and print().

        For each step this shows a heading, the observation image and/or text,
        the raw actions list, the step's own duration, and the time elapsed
        from the first step's start to this step's end. A duration is shown as
        "N/A" when a timestamp it needs is missing, and the step duration is
        shown as "Error parsing timestamps" when a step timestamp cannot be
        parsed. An empty trajectory renders nothing.
        """
        if not self.trajectory:
            # Nothing to show; indexing the first step would raise IndexError.
            return

        def parse_iso(timestamp: str) -> datetime.datetime:
            # A trailing "Z" is rewritten as an explicit "+00:00" offset before parsing.
            return datetime.datetime.fromisoformat(timestamp.replace("Z", "+00:00"))

        def format_delta(delta: datetime.timedelta) -> str:
            elapsed = delta.total_seconds()
            return f"{int(elapsed // 60)}m {elapsed % 60:.2f}s"

        # Reference point for the cumulative duration; None when it is missing
        # or malformed, in which case only per-step durations can be reported.
        first_start = self.trajectory[0].start_timestamp
        try:
            t_start_dt = parse_iso(first_start) if first_start else None
        except ValueError:
            t_start_dt = None

        for i, step in enumerate(self.trajectory):
            # Use Markdown for better step separation in Jupyter
            display(Markdown(f"### Step {i + 1}"))

            # Observation Image
            if step.observation_url:
                try:
                    # Display in Jupyter/IPython environment using HTML
                    display(Markdown("**Observation Image:**"))
                    display(HTML(f'<img src="{step.observation_url}" style="max-width:100%;"/>'))
                    display(Markdown(f"[Image Link]({step.observation_url})"))
                except Exception as e:
                    print(f" [Error processing image: {e}]")
            elif not step.observation_text:  # Only print if no image AND no text
                print(" No visual or text observation provided.")

            # Observation Text
            if step.observation_text:
                print(f" Observation Text: {step.observation_text}")

            # Actions
            print(f"\n Actions: {step.actions}")  # Added newline for spacing

            # Durations: reset both on every iteration so a step without usable
            # timestamps never reports a stale or undefined value.
            duration_str = "N/A"
            total_duration_str = "N/A"
            if step.start_timestamp and step.end_timestamp:
                try:
                    start_dt = parse_iso(step.start_timestamp)
                    end_dt = parse_iso(step.end_timestamp)
                    duration_str = format_delta(end_dt - start_dt)

                    # Total duration from the start of the run up to this step
                    if t_start_dt is not None:
                        total_duration_str = format_delta(end_dt - t_start_dt)
                except ValueError:
                    duration_str = "Error parsing timestamps"  # Handle potential format issues
            print(f" Step Duration: {duration_str}")
            print(f" Total Duration: {total_duration_str}")
            display(Markdown("---"))  # Use Markdown horizontal rule
hud/types.py ADDED
@@ -0,0 +1,65 @@
1
+ from __future__ import annotations
2
+
3
+ import enum
4
+ from pathlib import Path
5
+ from typing import Any, Literal
6
+
7
+ from pydantic import BaseModel
8
+
9
+
10
class CustomGym(BaseModel):
    """
    Public environment specification with a dockerfile and controller.

    If the location is remote, the env will be created on the server.
    If the location is local, the env will be created locally via docker.

    The dockerfile text may be passed directly; otherwise it is read from a
    file named "Dockerfile" inside controller_source_dir. Construction fails
    with ValueError when neither source is available.
    """
    type: Literal["public"] = "public"
    dockerfile: str | None = None
    location: Literal["local", "remote"]
    ports: list[int] | None = None
    # If path, then it is a development environment on the local computer
    # If none, then the controller must be installed in the environment through the dockerfile
    # Can be provided as a string or Path object
    controller_source_dir: str | Path | None = None

    def model_post_init(self, __context: Any, /) -> None:
        """Normalize controller_source_dir to a Path and fill in dockerfile when absent."""
        source_dir = self.controller_source_dir
        if isinstance(source_dir, str):
            # Store the normalized Path back on the model
            source_dir = Path(source_dir)
            self.controller_source_dir = source_dir

        # An explicitly supplied dockerfile needs no further work
        if self.dockerfile is not None:
            return

        if source_dir is None:
            raise ValueError("Either dockerfile or controller_source_dir must be provided")

        # Fall back to the Dockerfile that lives next to the controller sources
        candidate = source_dir / "Dockerfile"
        if not candidate.exists():
            raise ValueError(f"Dockerfile not found in {self.controller_source_dir}")

        self.dockerfile = candidate.read_text()
46
+
47
+ # Strings are identifiers for gyms on the HUD server
48
+ Gym = CustomGym | str
49
+
50
class EnvironmentStatus(str, enum.Enum):
    """
    Lifecycle state of an environment, usable wherever a plain string is.

    Attributes:
        INITIALIZING: The environment is initializing
        RUNNING: The environment is running
        COMPLETED: The environment is completed
        ERROR: The environment is in an error state
    """

    INITIALIZING = "initializing"
    RUNNING = "running"
    COMPLETED = "completed"
    ERROR = "error"
65
+
hud/utils/__init__.py CHANGED
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
- from .config import configuration
3
+ from .common import ExecuteResult
4
+ from .config import HudStyleConfig, HudStyleConfigs, expand_config
5
+ from .telemetry import stream
4
6
 
5
- __all__ = ["configuration"]
7
+ __all__ = ["ExecuteResult", "HudStyleConfig", "HudStyleConfigs", "expand_config", "stream"]
hud/utils/common.py ADDED
@@ -0,0 +1,69 @@
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import logging
5
+ import tarfile
6
+ from typing import TYPE_CHECKING, TypedDict
7
+
8
+ from hud.server.requests import make_request
9
+ from hud.settings import settings
10
+
11
+ if TYPE_CHECKING:
12
+ from pathlib import Path
13
+
14
+ logger = logging.getLogger("hud.utils.common")
15
+
16
class ExecuteResult(TypedDict):
    """
    Outcome of running a command.

    Attributes:
        stdout: Bytes the command wrote to standard output
        stderr: Bytes the command wrote to standard error
        exit_code: The command's exit code
    """
    stdout: bytes
    stderr: bytes
    exit_code: int
28
+
29
+
30
def directory_to_tar_bytes(directory_path: Path) -> bytes:
    """
    Pack the files under a directory into an uncompressed tar archive in memory.

    The directory is searched recursively; only entries for which
    ``is_file()`` is true are added, each stored under its path relative
    to ``directory_path``. Nothing is written to disk.

    Args:
        directory_path: Path to the directory to convert

    Returns:
        Bytes of the tar archive
    """
    buffer = io.BytesIO()

    with tarfile.open(fileobj=buffer, mode="w") as archive:
        regular_files = (entry for entry in directory_path.rglob("*") if entry.is_file())
        for entry in regular_files:
            # Archive members are named relative to the directory root
            member_name = entry.relative_to(directory_path)
            logger.debug("Adding %s to tar archive", member_name)
            archive.add(entry, arcname=str(member_name))

    # getvalue() returns the whole buffer regardless of the stream position
    return buffer.getvalue()
57
+
58
+
59
async def get_gym_id(gym_name_or_id: str) -> str:
    """
    Resolve a gym name or ID to the gym's ID via the gyms endpoint.

    Args:
        gym_name_or_id: Value placed in the ``/v1/gyms/{...}`` request path

    Returns:
        The "id" entry of the response
    """
    endpoint = f"{settings.base_url}/v1/gyms/{gym_name_or_id}"
    response = await make_request(method="GET", url=endpoint, api_key=settings.api_key)
    return response["id"]
hud/utils/config.py CHANGED
@@ -1,7 +1,185 @@
1
1
  from __future__ import annotations
2
2
 
3
- from hud.settings import settings
3
+ import logging
4
+ import re
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from pydantic import BaseModel
8
+
9
+ if TYPE_CHECKING:
10
+ from collections.abc import Iterator
11
+
12
+ from hud.task import Task
13
+
14
+ logger = logging.getLogger("hud.utils.config")
15
+
16
+ REMOTE_FUNCTION_PREFIX = "private_"
17
+ REMOTE_SETUP = "setup"
18
+ REMOTE_EVALUATE = "evaluate"
19
+
20
class HudStyleConfig(BaseModel):
    """
    A function name plus its positional arguments.

    ``len()``, indexing, and iteration all operate on ``args``.
    """

    function: str  # Format: "x.y.z"
    args: list[Any]  # Must be json serializable

    id: str | None = None  # Optional id for remote execution

    def __str__(self) -> str:
        """Format as "<function>: <arg>, <arg>, ..."."""
        rendered_args = ", ".join(map(str, self.args))
        return f"{self.function}: {rendered_args}"

    def __iter__(self) -> Iterator[Any]:
        """Iterate over the arguments."""
        yield from self.args

    def __getitem__(self, index: int) -> Any:
        """Return the argument at ``index``."""
        return self.args[index]

    def __len__(self) -> int:
        """Return the number of arguments."""
        return len(self.args)
37
+
38
+ # Type alias for the shorthand config, which just converts to function name and args
39
+ ShorthandConfig = tuple[str | dict[str, Any] | list[str] | list[dict[str, Any]], ...]
40
+
41
+ # Type alias for multiple config formats
42
+ HudStyleConfigs = ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | dict[str, Any] | str
43
+
44
+ def _is_valid_python_name(name: str) -> bool:
45
+ """Check if a string is a valid Python identifier."""
46
+ return bool(re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", name))
47
+
48
def _validate_hud_config(config: dict) -> HudStyleConfig:
    """
    Validate and convert a dictionary to an HudStyleConfig.

    Args:
        config: Mapping with a "function" string and optional "args" and "id"
            entries. "args" may be a list, a single bare value (wrapped into a
            one-element list), or absent (treated as no arguments).

    Returns:
        HudStyleConfig: The equivalent config object

    Raises:
        ValueError: If "function" is not a string or is not a valid dotted path
    """
    function = config.get("function")
    if not isinstance(function, str):
        raise ValueError("function must be a string")

    # Validate function path components
    _split_and_validate_path(function)

    if "args" not in config:
        # A missing key previously surfaced as a bare KeyError; treat it like
        # the string shorthand, which also means "no arguments".
        args = []
    else:
        raw_args = config["args"]
        args = raw_args if isinstance(raw_args, list) else [raw_args]

    return HudStyleConfig(function=function, args=args, id=config.get("id"))
60
+
61
def _split_and_validate_path(path: str) -> None:
    """
    Validate every dot-separated component of a function path.

    Args:
        path: Dotted function path such as "x.y.z"

    Raises:
        ValueError: If the path is empty or any component is not a valid
            Python identifier
    """
    # str.split() never returns an empty list, so emptiness has to be checked
    # on the path itself for this error to be reachable.
    if not path:
        raise ValueError("Empty function path")

    for part in path.split("."):
        if not _is_valid_python_name(part):
            raise ValueError(f"Invalid Python identifier in path: {part}")
72
+
73
def expand_config(config: HudStyleConfigs) -> list[HudStyleConfig]:
    """
    Process a config into a standardized list of HudStyleConfig objects.

    Args:
        config: Can be:
            - A tuple where first element is function name and rest are args
            - A HudStyleConfig object
            - A dictionary with "function" and "args" keys
            - A list of HudStyleConfig objects
            - A string, taken as a function name with no args

    Returns:
        list[HudStyleConfig]: List of standardized configurations

    Raises:
        ValueError: If the configuration format is invalid
    """
    logger.debug("Processing config: %s", config)

    # If it's already a HudStyleConfig, just wrap it in a list
    if isinstance(config, HudStyleConfig):
        return [config]

    # If it's a list of HudStyleConfigs, return as is
    if isinstance(config, list) and all(isinstance(item, HudStyleConfig) for item in config):
        return config

    # Handle dictionary configuration
    if isinstance(config, dict):
        return [_validate_hud_config(config)]

    # A bare string is a function name without arguments
    if isinstance(config, str):
        return [HudStyleConfig(function=config, args=[])]

    # Handle tuple format
    if isinstance(config, tuple):
        if len(config) < 1 or not isinstance(config[0], str):
            # Built as one parenthesized expression: the second half used to be
            # a stray statement without an f-prefix, so it never reached the
            # message. The offending value is shown because its type is always tuple.
            error_msg = (
                "Invalid tuple configuration. "
                f"Expected tuple[str, ...], got: {config!r}"
            )
            logger.error(error_msg)
            raise ValueError(error_msg)

        # First element is the function name, rest are args
        function_name, *args = config
        return [HudStyleConfig(function=function_name, args=args)]

    # Unknown configuration type
    error_msg = f"Unknown configuration type: {type(config)}"
    logger.error(error_msg)
    raise ValueError(error_msg)
125
+
126
def create_remote_config(
    task: Task | None = None,
    config: HudStyleConfigs | None = None,
    function: str | None = None,
) -> list[HudStyleConfig]:
    """
    Work out the configuration to use for a remote function call.

    Args:
        task: Task object with configuration
        config: Direct configuration (expanded or not)
        function: Function name to use

    Returns:
        list[HudStyleConfig]: List of standardized configurations

    Raises:
        ValueError: If neither function nor config is given, or if function is
            given without both config and task

    Resolution order:
        0) No function: return the expanded config as is
        1) Explicit config: wrap the expanded config as the args of `function`
        2) Task attribute named `function` is non-empty: expand it, stamp
           task.id on its first entry, and wrap it as the args of `function`
        3) task.config is non-empty: store task.id under its "id" key and pass
           it as the single arg of `function`
        4) task.id is set: call private_[function] with task.id as the only arg
        5) Otherwise: call `function` with no args
    """
    if function is None:
        if not config:
            raise ValueError("Either function or config must be provided")
        return expand_config(config)

    def wrap(arguments: list, name: str = function) -> list[HudStyleConfig]:
        # Every remaining outcome is a single-element list
        return [HudStyleConfig(function=name, args=arguments)]

    # Case 1: Explicit config provided
    if config:
        return wrap(expand_config(config))

    # Must have a task for the remaining cases
    if task is None:
        raise ValueError("Either task or config must be provided")

    # Case 2: Task has the specified function attribute
    candidate = getattr(task, function, None)
    if candidate and len(candidate) > 0:
        nested = expand_config(candidate)
        if task.id:
            nested[0].id = task.id  # for remote IDs
        return wrap(nested)

    # Case 3: Fall back to task.config
    if hasattr(task, "config") and task.config:
        if task.id:
            task.config["id"] = task.id  # for remote IDs
        return wrap([task.config])

    # Case 4: Use task.id
    if task.id:
        return wrap([task.id], name=f"{REMOTE_FUNCTION_PREFIX}{function}")

    # No valid configuration found
    return wrap([])
4
185
 
5
- # For backwards compatibility, keep 'configuration'
6
- # but have it point to the settings instance
7
- configuration = settings
hud/utils/telemetry.py ADDED
@@ -0,0 +1,67 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
def stream(live_url: str | None = None) -> str:
    """
    Display a stream in the HUD system.

    Builds an iframe for ``live_url`` at 1920x1080, scaled to half size, and
    tries to show it with IPython. Display problems, including IPython not
    being importable, are logged as warnings and do not stop the HTML from
    being returned.

    Args:
        live_url: URL placed in the iframe's ``src`` attribute

    Returns:
        The HTML string containing the iframe

    Raises:
        ValueError: If live_url is None
    """
    if live_url is None:
        raise ValueError("live_url cannot be None")

    html_content = f"""
    <div style="width: 960px; height: 540px; overflow: hidden;">
        <div style="transform: scale(0.5); transform-origin: top left;">
            <iframe src="{live_url}" width="1920" height="1080" style="border: 1px solid #ddd;">
            </iframe>
        </div>
    </div>
    """
    try:
        # Imported inside the guard so that a missing IPython is handled like
        # any other display failure instead of raising ImportError.
        from IPython.display import HTML, display

        display(HTML(html_content))
    except Exception as e:
        logger.warning(e)

    return html_content
29
+
30
+
31
def display_screenshot(base64_image: str, width: int = 960, height: int = 540) -> str:
    """
    Display a base64-encoded screenshot image.

    Args:
        base64_image: Base64-encoded image string; a value that already starts
            with "data:image" is used unchanged, anything else is prefixed
            with a PNG data URI header
        width: Display width in pixels
        height: Display height in pixels

    Returns:
        The HTML string used to display the image

    Note:
        This function will both display the image in IPython environments
        and return the HTML string for other contexts. Display problems,
        including IPython not being importable, are logged as warnings.
    """
    # Ensure the base64 image doesn't already have the data URI prefix
    if base64_image.startswith("data:image"):
        img_src = base64_image
    else:
        img_src = f"data:image/png;base64,{base64_image}"

    html_content = f"""
    <div style="width: {width}px; height: {height}px; overflow: hidden; margin: 10px 0; border: 1px solid #ddd;">
        <img src="{img_src}" style="max-width: 100%; max-height: 100%;">
    </div>
    """  # noqa: E501

    # Display in IPython environments
    try:
        # Imported inside the guard so that contexts without IPython still get
        # the HTML back, as the Note above promises.
        from IPython.display import HTML, display

        display(HTML(html_content))
    except Exception as e:
        logger.warning(e)

    return html_content