hud-python 0.4.28__py3-none-any.whl → 0.4.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic.

Files changed (77)
  1. hud/__init__.py +2 -1
  2. hud/agents/base.py +81 -45
  3. hud/agents/claude.py +8 -4
  4. hud/agents/openai_chat_generic.py +66 -40
  5. hud/agents/tests/test_base.py +0 -4
  6. hud/agents/tests/test_openai.py +1 -1
  7. hud/cli/__init__.py +182 -52
  8. hud/cli/dev.py +8 -9
  9. hud/cli/eval.py +317 -119
  10. hud/cli/flows/__init__.py +0 -0
  11. hud/cli/flows/tasks.py +0 -0
  12. hud/cli/get.py +160 -0
  13. hud/cli/rl/__init__.py +567 -71
  14. hud/cli/rl/config.py +94 -0
  15. hud/cli/rl/display.py +133 -0
  16. hud/cli/rl/gpu.py +63 -0
  17. hud/cli/rl/gpu_utils.py +318 -0
  18. hud/cli/rl/presets.py +96 -0
  19. hud/cli/rl/remote_runner.py +347 -0
  20. hud/cli/rl/rl_api.py +150 -0
  21. hud/cli/rl/vllm.py +177 -0
  22. hud/cli/tests/test_analyze_metadata.py +0 -1
  23. hud/cli/utils/tasks.py +26 -0
  24. hud/clients/base.py +21 -23
  25. hud/clients/mcp_use.py +36 -44
  26. hud/clients/tests/test_mcp_use_retry.py +10 -10
  27. hud/datasets/__init__.py +4 -3
  28. hud/datasets/{execution/parallel.py → parallel.py} +1 -1
  29. hud/datasets/{execution/runner.py → runner.py} +1 -1
  30. hud/datasets/utils.py +1 -1
  31. hud/native/comparator.py +6 -6
  32. hud/native/tests/test_comparator.py +8 -8
  33. hud/native/tests/test_native_init.py +13 -11
  34. hud/otel/config.py +1 -1
  35. hud/otel/instrumentation.py +35 -0
  36. hud/rl/README.md +30 -0
  37. hud/rl/__init__.py +1 -0
  38. hud/rl/actor.py +174 -0
  39. hud/rl/buffer.py +371 -0
  40. hud/rl/chat_template.jinja +101 -0
  41. hud/rl/config.py +184 -0
  42. hud/rl/distributed.py +95 -0
  43. hud/rl/learner.py +589 -0
  44. hud/rl/tests/__init__.py +1 -0
  45. hud/rl/tests/test_learner.py +171 -0
  46. hud/rl/train.py +354 -0
  47. hud/rl/types.py +101 -0
  48. hud/rl/utils/start_vllm_server.sh +30 -0
  49. hud/rl/utils.py +524 -0
  50. hud/rl/vllm_adapter.py +125 -0
  51. hud/settings.py +6 -0
  52. hud/telemetry/__init__.py +2 -1
  53. hud/telemetry/job.py +46 -3
  54. hud/telemetry/tests/test_trace.py +3 -3
  55. hud/telemetry/trace.py +85 -13
  56. hud/tools/tests/test_computer.py +3 -3
  57. hud/tools/tests/test_computer_actions.py +1 -1
  58. hud/types.py +123 -2
  59. hud/utils/group_eval.py +223 -0
  60. hud/utils/hud_console.py +113 -13
  61. hud/utils/tasks.py +119 -0
  62. hud/utils/tests/test_version.py +1 -1
  63. hud/version.py +1 -1
  64. {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/METADATA +20 -2
  65. {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/RECORD +68 -48
  66. hud/cli/hf.py +0 -406
  67. hud/cli/rl/README.md +0 -243
  68. hud/cli/rl/init.py +0 -370
  69. hud/cli/rl/pod.py +0 -501
  70. hud/cli/rl/ssh.py +0 -322
  71. hud/cli/rl/train.py +0 -562
  72. hud/cli/rl/utils.py +0 -165
  73. hud/datasets/execution/__init__.py +0 -13
  74. hud/datasets/task.py +0 -116
  75. {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/WHEEL +0 -0
  76. {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/entry_points.txt +0 -0
  77. {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/licenses/LICENSE +0 -0
hud/native/tests/test_comparator.py CHANGED
@@ -11,7 +11,7 @@ from hud.native.comparator import (
     ComparisonResult,
     DataType,
     auto_select_mode,
-    comparator_server,
+    comparator,
     detect_type,
     extract_boolean,
     extract_json,
@@ -321,10 +321,10 @@ class TestAliasTools:
     @pytest.mark.asyncio
     async def test_aliases_work(self):
         """Test that aliases are properly registered and work."""
-        from hud.native.comparator import comparator_server
+        from hud.native.comparator import comparator
 
         # Check that aliases are registered
-        tool_names = [t.name for t in comparator_server._tool_manager._tools.values()]
+        tool_names = [t.name for t in comparator._tool_manager._tools.values()]
 
         expected_aliases = [
             "compare_exact",
@@ -433,7 +433,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_json_alias_preprocessing(self):
         """Test JSON extraction in compare_json tool."""
-        tools = {t.name: t for t in comparator_server._tool_manager._tools.values()}
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
         json_tool = tools["compare_json"]
 
         assert isinstance(json_tool, FunctionTool)
@@ -448,7 +448,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_numeric_alias_preprocessing(self):
         """Test number extraction in numeric tools."""
-        tools = {t.name: t for t in comparator_server._tool_manager._tools.values()}
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
 
         # Float tool
         float_tool = tools["compare_float"]
@@ -471,7 +471,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_boolean_alias_preprocessing(self):
         """Test boolean extraction in compare_boolean tool."""
-        tools = {t.name: t for t in comparator_server._tool_manager._tools.values()}
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
         bool_tool = tools["compare_boolean"]
 
         assert isinstance(bool_tool, FunctionTool)
@@ -485,7 +485,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_list_alias_preprocessing(self):
         """Test list extraction in compare_list tool."""
-        tools = {t.name: t for t in comparator_server._tool_manager._tools.values()}
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
         list_tool = tools["compare_list"]
 
         assert isinstance(list_tool, FunctionTool)
@@ -499,7 +499,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_complex_llm_output(self):
         """Test extraction from complex LLM outputs with reasoning."""
-        tools = {t.name: t for t in comparator_server._tool_manager._tools.values()}
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
         json_tool = tools["compare_json"]
 
         llm_output = """
hud/native/tests/test_native_init.py CHANGED
@@ -8,12 +8,12 @@ class TestNativeInit:
 
     def test_comparator_server_import(self):
         """Test that comparator server can be imported."""
-        from hud.native.comparator import comparator_server
+        from hud.native.comparator import comparator
         from hud.server import MCPServer
 
         # Verify comparator is an MCPServer instance
-        assert isinstance(comparator_server, MCPServer)
-        assert comparator_server.name == "comparator"
+        assert isinstance(comparator, MCPServer)
+        assert comparator.name == "comparator"
 
     def test_all_exports(self):
         """Test that __all__ is properly defined."""
@@ -31,17 +31,17 @@ class TestNativeInit:
 
     def test_comparator_tools_registered(self):
         """Test that comparator server has tools registered."""
-        from hud.native.comparator import comparator_server
+        from hud.native.comparator import comparator
 
         # The server should have tools registered
         # We can check that the tool manager has tools
-        tool_names = [t.name for t in comparator_server._tool_manager._tools.values()]
+        tool_names = [t.name for t in comparator._tool_manager._tools.values()]
 
         # Should have the main compare tool
         assert "compare" in tool_names
 
         # Should have the submit tool
-        assert "submit" in tool_names
+        assert "response" in tool_names
 
         # Should have all the alias tools
         expected_aliases = [
@@ -64,16 +64,18 @@ class TestNativeInit:
 
     def test_comparator_tool_functionality(self):
         """Test that we can get the CompareTool from the comparator."""
-        from hud.native.comparator import comparator_server
-        from hud.tools import BaseTool
+        from hud.native.comparator import comparator
 
         # Get the compare tool
         compare_tool = None
-        for tool in comparator_server._tool_manager._tools.values():
+        for tool in comparator._tool_manager._tools.values():
             if tool.name == "compare":
                 compare_tool = tool
                 break
 
         assert compare_tool is not None
-        assert isinstance(compare_tool, BaseTool)
-        assert hasattr(compare_tool, "__call__")
+        # FastMCP wraps tools as FunctionTool instances
+        assert hasattr(compare_tool, "name")
+        assert compare_tool.name == "compare"
+        # FunctionTool has a 'fn' attribute for the callable
+        assert hasattr(compare_tool, "fn") or hasattr(compare_tool, "__call__")
hud/otel/config.py CHANGED
@@ -111,7 +111,7 @@ def configure_telemetry(
         # Error if no exporters are configured
         raise ValueError(
             "No telemetry backend configured. Either:\n"
-            "1. Set HUD_API_KEY environment variable for HUD telemetry\n"
+            "1. Set HUD_API_KEY environment variable for HUD telemetry (https://app.hud.so)\n"
             "2. Use enable_otlp=True with configure_telemetry() for alternative backends (e.g., Jaeger)\n"  # noqa: E501
         )
     elif not settings.telemetry_enabled:
hud/otel/instrumentation.py CHANGED
@@ -55,6 +55,9 @@ def _patch_mcp_instrumentation() -> None:
     try:
         from opentelemetry.instrumentation.mcp.instrumentation import McpInstrumentor
 
+        # First, patch the get_error_type function to handle invalid HTTP status codes
+        _patch_get_error_type()
+
         def patched_transport_wrapper(self: Any, tracer: Any) -> Callable[..., Any]:
             @asynccontextmanager
             async def traced_method(
@@ -98,3 +101,35 @@ def _patch_mcp_instrumentation() -> None:
 
         logger = logging.getLogger(__name__)
         logger.warning("Failed to patch MCP instrumentation: %s", e)
+
+
+def _patch_get_error_type() -> None:
+    """Patch get_error_type to handle invalid HTTP status codes gracefully."""
+    import re
+    from http import HTTPStatus
+
+    try:
+        import opentelemetry.instrumentation.mcp.instrumentation as mcp_inst
+
+        def patched_get_error_type(error_message: str) -> str | None:
+            """Extract HTTP status from error message, handling invalid codes."""
+            if not isinstance(error_message, str):
+                return None
+            match = re.search(r"\b(4\d{2}|5\d{2})\b", error_message)
+            if match:
+                num = int(match.group())
+                try:
+                    # Only return if it's a valid HTTPStatus
+                    if 400 <= num <= 599:
+                        return HTTPStatus(num).name
+                except ValueError:
+                    # Not a valid HTTP status code
+                    logger.debug("Ignoring invalid HTTP status code: %s", num)
+            return None
+
+        # Apply the patch
+        mcp_inst.get_error_type = patched_get_error_type
+        logger.debug("Patched get_error_type to handle invalid HTTP status codes")
+
+    except Exception as e:
+        logger.warning("Failed to patch get_error_type: %s", e)
hud/rl/README.md ADDED
@@ -0,0 +1,30 @@
1
+ We suggest running hud rl (or with the --local flag) for optimal hyperparameters and native HuggingFace running.
2
+
3
+ However, to run this independently, sping up an instance with at least 2 GPUs and run:
4
+ ```bash
5
+ sudo apt-get update -y && sudo apt-get install -y cuda-toolkit-12-6
6
+ uv pip install -e .[rl]
7
+ uv pip install ninja
8
+ uv pip install flash-attn --no-build-isolation
9
+ ```
10
+
11
+ Launch a vllm server with:
12
+ ```bash
13
+ export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
14
+ export TOKENIZERS_PARALLELISM=false
15
+ export VLLM_LOGGING_LEVEL=INFO
16
+ export CUDA_VISIBLE_DEVICES=7 # Set this to your last GPU
17
+
18
+ uv run vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
19
+ --api-key token-abc123 --host 0.0.0.0 --port 8000 --tensor-parallel-size 1 --trust-remote-code \
20
+ --max-model-len 16384 --enable-lora --max-lora-rank 64 --max-cpu-loras 4 --enable-auto-tool-choice \
21
+ --tool-call-parser hermes --disable-log-requests --dtype auto
22
+ ```
23
+
24
+ And training with (replace 2 with your spare GPUs):
25
+ ```bash
26
+ hud get hud-evals/2048-basic
27
+ torchrun --nproc-per-node 2 -m hud.rl.train --tasks 2048-basic.json --verbose
28
+ ```
29
+
30
+ Add a `--config path/to/config.json` flag to run a specific configuration (or change the defaults in config.py)
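As a quick illustration of that last point, the invocation below simply reuses the torchrun command from the README above and adds the `--config` flag; the config filename is a placeholder, not a file shipped with the package:

```bash
# Sketch only: same training command as in the README, pointed at a custom config.
# "my_rl_config.json" is a placeholder path; the defaults live in hud/rl/config.py.
hud get hud-evals/2048-basic
torchrun --nproc-per-node 2 -m hud.rl.train \
  --tasks 2048-basic.json \
  --config my_rl_config.json \
  --verbose
```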
hud/rl/__init__.py ADDED
@@ -0,0 +1 @@
+"""RL module for HUD."""
hud/rl/actor.py ADDED
@@ -0,0 +1,174 @@
1
+ """Actor for episode collection using vLLM and HUD."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+
8
+ import httpx
9
+ from openai import AsyncOpenAI
10
+
11
+ import hud
12
+ from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
13
+ from hud.clients.utils.retry_transport import create_retry_httpx_client
14
+ from hud.types import Task, Trace
15
+ from hud.utils.hud_console import HUDConsole
16
+
17
+ from .config import Config
18
+
19
+ logger = logging.getLogger(__name__)
20
+ hud_console = HUDConsole(logger)
21
+
22
+
23
+ class Actor:
24
+ """Collects episodes using vLLM-served models via HUD agents."""
25
+
26
+ def __init__(self, config: Config) -> None:
27
+ self.config = config
28
+ self.actor_config = config.actor
29
+ self.current_adapter = config.model.base_model
30
+
31
+ # Setup OpenAI client for vLLM
32
+ base_url = self.actor_config.vllm_base_url.replace("localhost", "127.0.0.1")
33
+ self.openai_client = self._create_openai_client(base_url)
34
+
35
+ def _create_openai_client(self, base_url: str) -> AsyncOpenAI:
36
+ """Create OpenAI client with optimized settings for vLLM."""
37
+ # Match connection limits to parallel_episodes to avoid bottlenecks
38
+ # Use shorter per-request timeout and keep retries modest to avoid long blocking
39
+ http_client = create_retry_httpx_client(
40
+ timeout=httpx.Timeout(30.0),
41
+ )
42
+ return AsyncOpenAI(
43
+ base_url=base_url,
44
+ api_key=self.actor_config.vllm_api_key,
45
+ http_client=http_client,
46
+ max_retries=2,
47
+ )
48
+
49
+ def create_agent(self) -> GenericOpenAIChatAgent:
50
+ """Create an agent with the current adapter."""
51
+ return GenericOpenAIChatAgent(
52
+ openai_client=self.openai_client,
53
+ model_name=self.current_adapter,
54
+ allowed_tools=self.actor_config.allowed_tools,
55
+ append_setup_output=False,
56
+ system_prompt=self.actor_config.system_prompt,
57
+ verbose=self.config.verbose,
58
+ completion_kwargs={
59
+ "temperature": self.actor_config.temperature,
60
+ "max_tokens": self.actor_config.max_new_tokens,
61
+ "tool_choice": "required" if self.actor_config.force_tool_choice else "auto",
62
+ },
63
+ )
64
+
65
+ def update_adapter(self, adapter_name: str) -> None:
66
+ """Update the current adapter being used."""
67
+ self.current_adapter = adapter_name
68
+ hud_console.info(f"[Actor] Using adapter: {adapter_name}")
69
+
70
+ async def run_tasks(self, tasks: list[Task], job_id: str) -> list[Trace]:
71
+ """Run tasks and collect traces."""
72
+ traces = []
73
+
74
+ # Process tasks in batches respecting max_parallel_episodes limit
75
+ for batch_start in range(0, len(tasks), self.actor_config.max_parallel_episodes):
76
+ batch_end = min(batch_start + self.actor_config.max_parallel_episodes, len(tasks))
77
+ batch = tasks[batch_start:batch_end]
78
+
79
+ # Run batch in parallel with per-episode timeout protection
80
+ async def run_with_timeout(t: Task) -> Trace:
81
+ try:
82
+ return await asyncio.wait_for(
83
+ self._run_task(t, job_id),
84
+ timeout=self.actor_config.episode_timeout_sec,
85
+ )
86
+ except TimeoutError:
87
+ hud_console.warning_log(f"Episode timed out for task {t.id}")
88
+ return Trace(isError=True, content="Episode timeout")
89
+
90
+ results = await asyncio.gather(
91
+ *[run_with_timeout(t) for t in batch],
92
+ return_exceptions=True,
93
+ )
94
+
95
+ # Normalize exceptions to error traces
96
+ for res in results:
97
+ if isinstance(res, Exception):
98
+ hud_console.warning_log(f"Episode error: {res}")
99
+ traces.append(Trace(isError=True, content=str(res)))
100
+ else:
101
+ traces.append(res)
102
+
103
+ return traces
104
+
105
+ async def _run_task(self, task: Task, job_id: str) -> Trace:
106
+ """Run a single task."""
107
+ agent = self.create_agent()
108
+
109
+ # Run the task
110
+ try:
111
+ with hud.trace(f"Training | {task.id}", job_id=job_id):
112
+ result = await agent.run(task, max_steps=self.actor_config.max_steps_per_episode)
113
+
114
+ except Exception:
115
+ logger.info("GOT EXCEPTION")
116
+ return Trace(isError=True)
117
+
118
+ result.info["tool_spec"] = agent.get_tool_schemas()
119
+
120
+ return result
121
+
122
+
123
+ if __name__ == "__main__":
124
+ from hud.types import Task
125
+
126
+ async def test_actor() -> None:
127
+ """Test the actor with a single 2048 task using local hud-browser image."""
128
+ config = Config()
129
+ config.actor.max_parallel_episodes = 1
130
+ config.actor.max_steps_per_episode = 6
131
+ config.actor.episode_timeout_sec = 120
132
+ config.verbose = True
133
+
134
+ # Create test task with local hud-browser image
135
+ task_data = {
136
+ "id": "test_2048_128",
137
+ "prompt": "Play the browser-based 2048 game and try to reach the 128 tile. Start by taking a screenshot, then make strategic moves using arrow keys.", # noqa: E501
138
+ "mcp_config": {
139
+ "local": {
140
+ "command": "sh",
141
+ "args": [
142
+ "-c",
143
+ "docker run --rm --platform linux/amd64 -i hud-browser:latest 2>/dev/null",
144
+ ],
145
+ }
146
+ },
147
+ "setup_tool": {"name": "launch_app", "arguments": {"app_name": "2048"}},
148
+ "evaluate_tool": {
149
+ "name": "evaluate",
150
+ "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}},
151
+ },
152
+ "system_prompt": "You are an expert 2048 game player. Use arrow keys to reach the target tile. First take a screenshot, then make strategic moves.", # noqa: E501
153
+ }
154
+
155
+ task = Task(**task_data)
156
+ actor = Actor(config)
157
+
158
+ logger.info("Testing actor with task: %s", task.id)
159
+ logger.info("Model: %s", config.model.base_model)
160
+ logger.info("VLLM: %s", config.actor.vllm_base_url)
161
+
162
+ traces = await actor.run_tasks([task], job_id="test_2048")
163
+
164
+ for trace in traces:
165
+ if trace.isError:
166
+ logger.info("Error: %s", trace.content)
167
+ else:
168
+ logger.info("Success!")
169
+ logger.info("Trace info: %s", trace.info if hasattr(trace, "info") else "No info")
170
+ # Check for evaluation in the trace info
171
+ if hasattr(trace, "info") and "evaluation" in trace.info:
172
+ logger.info(" Evaluation: %s", trace.info["evaluation"])
173
+
174
+ asyncio.run(test_actor())