hud-python 0.4.35__py3-none-any.whl → 0.4.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (63) hide show
  1. hud/agents/__init__.py +2 -0
  2. hud/agents/lite_llm.py +72 -0
  3. hud/agents/openai_chat_generic.py +21 -7
  4. hud/agents/tests/test_claude.py +32 -7
  5. hud/agents/tests/test_openai.py +29 -6
  6. hud/cli/__init__.py +228 -79
  7. hud/cli/build.py +26 -6
  8. hud/cli/dev.py +21 -40
  9. hud/cli/eval.py +96 -15
  10. hud/cli/flows/tasks.py +198 -65
  11. hud/cli/init.py +222 -629
  12. hud/cli/pull.py +6 -0
  13. hud/cli/push.py +11 -1
  14. hud/cli/rl/__init__.py +14 -4
  15. hud/cli/rl/celebrate.py +187 -0
  16. hud/cli/rl/config.py +15 -8
  17. hud/cli/rl/local_runner.py +44 -20
  18. hud/cli/rl/remote_runner.py +166 -87
  19. hud/cli/rl/viewer.py +141 -0
  20. hud/cli/rl/wait_utils.py +89 -0
  21. hud/cli/tests/test_build.py +3 -27
  22. hud/cli/tests/test_mcp_server.py +1 -12
  23. hud/cli/utils/config.py +85 -0
  24. hud/cli/utils/docker.py +21 -39
  25. hud/cli/utils/env_check.py +196 -0
  26. hud/cli/utils/environment.py +4 -3
  27. hud/cli/utils/interactive.py +2 -1
  28. hud/cli/utils/local_runner.py +204 -0
  29. hud/cli/utils/metadata.py +3 -1
  30. hud/cli/utils/package_runner.py +292 -0
  31. hud/cli/utils/remote_runner.py +4 -1
  32. hud/cli/utils/source_hash.py +108 -0
  33. hud/clients/base.py +1 -1
  34. hud/clients/fastmcp.py +1 -1
  35. hud/clients/mcp_use.py +30 -7
  36. hud/datasets/parallel.py +3 -1
  37. hud/datasets/runner.py +4 -1
  38. hud/otel/config.py +1 -1
  39. hud/otel/context.py +40 -6
  40. hud/rl/buffer.py +3 -0
  41. hud/rl/tests/test_learner.py +1 -1
  42. hud/rl/vllm_adapter.py +1 -1
  43. hud/server/server.py +234 -7
  44. hud/server/tests/test_add_tool.py +60 -0
  45. hud/server/tests/test_context.py +128 -0
  46. hud/server/tests/test_mcp_server_handlers.py +44 -0
  47. hud/server/tests/test_mcp_server_integration.py +405 -0
  48. hud/server/tests/test_mcp_server_more.py +247 -0
  49. hud/server/tests/test_run_wrapper.py +53 -0
  50. hud/server/tests/test_server_extra.py +166 -0
  51. hud/server/tests/test_sigterm_runner.py +78 -0
  52. hud/settings.py +38 -0
  53. hud/shared/hints.py +2 -2
  54. hud/telemetry/job.py +2 -2
  55. hud/types.py +9 -2
  56. hud/utils/tasks.py +32 -24
  57. hud/utils/tests/test_version.py +1 -1
  58. hud/version.py +1 -1
  59. {hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/METADATA +43 -23
  60. {hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/RECORD +63 -46
  61. {hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/WHEEL +0 -0
  62. {hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/entry_points.txt +0 -0
  63. {hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,166 @@
1
+ # filename: hud/server/tests/test_server_extra.py
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ from contextlib import asynccontextmanager, suppress
6
+
7
+ import anyio
8
+ import pytest
9
+
10
+ from hud.server import MCPServer
11
+ from hud.server import server as server_mod
12
+
13
+
14
+ @asynccontextmanager
15
+ async def _fake_stdio_server():
16
+ """
17
+ Stand-in for stdio_server that avoids reading real stdin.
18
+
19
+ It yields a pair of in-memory streams (receive, send) so the low-level server
20
+ can start and idle without touching sys.stdin/sys.stdout.
21
+ """
22
+ send_in, recv_in = anyio.create_memory_object_stream(100)
23
+ send_out, recv_out = anyio.create_memory_object_stream(100)
24
+ try:
25
+ yield recv_in, send_out
26
+ finally:
27
+ # best-effort close across anyio versions
28
+ for s in (send_in, recv_in, send_out, recv_out):
29
+ close = getattr(s, "close", None) or getattr(s, "aclose", None)
30
+ try:
31
+ if close is not None:
32
+ res = close()
33
+ if asyncio.iscoroutine(res):
34
+ await res
35
+ except Exception:
36
+ pass
37
+
38
+
39
+ @pytest.fixture
40
+ def patch_stdio(monkeypatch: pytest.MonkeyPatch):
41
+ """Patch stdio server to avoid stdin issues during tests."""
42
+ monkeypatch.setenv("FASTMCP_DISABLE_BANNER", "1")
43
+ monkeypatch.setattr("mcp.server.stdio.stdio_server", _fake_stdio_server)
44
+ monkeypatch.setattr("fastmcp.server.server.stdio_server", _fake_stdio_server)
45
+
46
+
47
+ @pytest.mark.asyncio
48
+ async def test_sigterm_flag_remains_true_without_shutdown_handler(patch_stdio):
49
+ """
50
+ When no @mcp.shutdown is registered, neither the lifespan.finally nor run_async.finally
51
+ should reset the global SIGTERM flag. This exercises the 'no handler' branches.
52
+ """
53
+ mcp = MCPServer(name="NoShutdownHandler")
54
+
55
+ task = asyncio.create_task(mcp.run_async(transport="stdio", show_banner=False))
56
+ try:
57
+ await asyncio.sleep(0.05)
58
+ # Simulate SIGTERM path
59
+ server_mod._sigterm_received = True # type: ignore[attr-defined]
60
+ finally:
61
+ with suppress(asyncio.CancelledError):
62
+ task.cancel()
63
+ await task
64
+
65
+ # Flag must remain set since no shutdown handler was installed
66
+ assert getattr(server_mod, "_sigterm_received") is True
67
+
68
+ # Always reset for other tests
69
+ server_mod._sigterm_received = False # type: ignore[attr-defined]
70
+
71
+
72
+ @pytest.mark.asyncio
73
+ async def test_last_shutdown_handler_wins(patch_stdio):
74
+ """
75
+ If multiple @mcp.shutdown decorators are applied, the last one should be the one that runs.
76
+ """
77
+ mcp = MCPServer(name="ShutdownOverride")
78
+ calls: list[str] = []
79
+
80
+ @mcp.shutdown
81
+ async def _first() -> None:
82
+ calls.append("first")
83
+
84
+ @mcp.shutdown
85
+ async def _second() -> None:
86
+ calls.append("second")
87
+
88
+ task = asyncio.create_task(mcp.run_async(transport="stdio", show_banner=False))
89
+ try:
90
+ await asyncio.sleep(0.05)
91
+ server_mod._sigterm_received = True # type: ignore[attr-defined]
92
+ finally:
93
+ with suppress(asyncio.CancelledError):
94
+ task.cancel()
95
+ await task
96
+
97
+ assert calls == ["second"], "Only the last registered shutdown handler should run"
98
+ server_mod._sigterm_received = False # type: ignore[attr-defined]
99
+
100
+
101
+ def test__run_with_sigterm_registers_handlers_when_enabled(monkeypatch: pytest.MonkeyPatch):
102
+ """
103
+ Verify that _run_with_sigterm attempts to register SIGTERM/SIGINT handlers
104
+ when the env var does NOT disable the handler. We stub AnyIO's TaskGroup so
105
+ the watcher doesn't block and the test returns immediately.
106
+ """
107
+ # Ensure handler is enabled
108
+ monkeypatch.delenv("FASTMCP_DISABLE_SIGTERM_HANDLER", raising=False)
109
+
110
+ # Record what the server tries to register
111
+ added_signals: list[int] = []
112
+
113
+ import asyncio as _asyncio
114
+
115
+ orig_get_running_loop = _asyncio.get_running_loop
116
+
117
+ def proxy_get_running_loop():
118
+ real = orig_get_running_loop()
119
+
120
+ class _LoopProxy:
121
+ __slots__ = ("_inner",)
122
+
123
+ def __init__(self, inner):
124
+ self._inner = inner
125
+
126
+ def add_signal_handler(self, signum, callback, *args):
127
+ added_signals.append(signum) # don't actually install
128
+ # no-op: skip calling inner.add_signal_handler to avoid OS constraints
129
+
130
+ def __getattr__(self, name):
131
+ # delegate everything else (create_task, call_soon, etc.)
132
+ return getattr(self._inner, name)
133
+
134
+ return _LoopProxy(real)
135
+
136
+ # Patch globally so both the test and hud.server.server see the proxy
137
+ monkeypatch.setattr(_asyncio, "get_running_loop", proxy_get_running_loop)
138
+
139
+ # Dummy TaskGroup that runs the work but skips _watch
140
+ class _DummyTG:
141
+ async def __aenter__(self):
142
+ return self
143
+
144
+ async def __aexit__(self, exc_type, exc, tb):
145
+ return False
146
+
147
+ def start_soon(self, fn, *args, **kwargs):
148
+ if getattr(fn, "__name__", "") == "_watch":
149
+ return
150
+ _asyncio.get_running_loop().create_task(fn(*args, **kwargs))
151
+
152
+ monkeypatch.setattr("anyio.create_task_group", lambda: _DummyTG())
153
+
154
+ # Simple coroutine that should run to completion
155
+ hit = {"v": False}
156
+
157
+ async def work():
158
+ hit["v"] = True
159
+
160
+ server_mod._run_with_sigterm(work)
161
+ assert hit["v"] is True
162
+
163
+ import signal as _signal
164
+
165
+ assert _signal.SIGTERM in added_signals
166
+ assert _signal.SIGINT in added_signals
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ from contextlib import asynccontextmanager, suppress
5
+
6
+ import anyio
7
+ import pytest
8
+
9
+ from hud.server import MCPServer
10
+ from hud.server import server as server_mod
11
+
12
+
13
+ def test__run_with_sigterm_executes_coro_when_handler_disabled(monkeypatch: pytest.MonkeyPatch):
14
+ """With FASTMCP_DISABLE_SIGTERM_HANDLER=1, _run_with_sigterm should just run the task."""
15
+ monkeypatch.setenv("FASTMCP_DISABLE_SIGTERM_HANDLER", "1")
16
+
17
+ hit = {"v": False}
18
+
19
+ async def work(arg, *, kw=None):
20
+ assert arg == 123 and kw == "ok"
21
+ hit["v"] = True
22
+
23
+ # Wrapper to exercise kwargs since TaskGroup.start_soon only accepts positional args
24
+ async def wrapper(arg):
25
+ await work(arg, kw="ok")
26
+
27
+ # Should return cleanly and mark hit
28
+ server_mod._run_with_sigterm(wrapper, 123)
29
+ assert hit["v"] is True
30
+
31
+
32
+ @asynccontextmanager
33
+ async def _fake_stdio_server():
34
+ """Stand-in for stdio_server that avoids reading real stdin."""
35
+ send_in, recv_in = anyio.create_memory_object_stream(100)
36
+ send_out, recv_out = anyio.create_memory_object_stream(100)
37
+ try:
38
+ yield recv_in, send_out
39
+ finally:
40
+ for s in (send_in, recv_in, send_out, recv_out):
41
+ close = getattr(s, "close", None) or getattr(s, "aclose", None)
42
+ try:
43
+ if close is not None:
44
+ res = close()
45
+ if asyncio.iscoroutine(res):
46
+ await res
47
+ except Exception:
48
+ pass
49
+
50
+
51
+ @pytest.fixture
52
+ def patch_stdio(monkeypatch: pytest.MonkeyPatch):
53
+ """Patch stdio server to avoid stdin issues during tests."""
54
+ monkeypatch.setenv("FASTMCP_DISABLE_BANNER", "1")
55
+ monkeypatch.setattr("mcp.server.stdio.stdio_server", _fake_stdio_server)
56
+ monkeypatch.setattr("fastmcp.server.server.stdio_server", _fake_stdio_server)
57
+
58
+
59
+ @pytest.mark.asyncio
60
+ async def test_shutdown_handler_exception_does_not_crash_and_resets_flag(patch_stdio):
61
+ """If @shutdown raises, run_async must swallow it and still reset the SIGTERM flag."""
62
+ mcp = MCPServer(name="ShutdownRaises")
63
+
64
+ @mcp.shutdown
65
+ async def _boom() -> None:
66
+ raise RuntimeError("kaboom")
67
+
68
+ task = asyncio.create_task(mcp.run_async(transport="stdio", show_banner=False))
69
+ try:
70
+ await asyncio.sleep(0.05)
71
+ server_mod._sigterm_received = True # trigger shutdown path
72
+ finally:
73
+ with suppress(asyncio.CancelledError):
74
+ task.cancel()
75
+ await task
76
+
77
+ # No exception propagated; flag must be reset
78
+ assert not getattr(server_mod, "_sigterm_received")
hud/settings.py CHANGED
@@ -1,7 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from pathlib import Path
4
+
3
5
  from pydantic import Field
4
6
  from pydantic_settings import BaseSettings, SettingsConfigDict
7
+ from pydantic_settings.sources import DotEnvSettingsSource, PydanticBaseSettingsSource
5
8
 
6
9
 
7
10
  class Settings(BaseSettings):
@@ -14,6 +17,41 @@ class Settings(BaseSettings):
14
17
 
15
18
  model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="allow")
16
19
 
20
+ @classmethod
21
+ def settings_customise_sources(
22
+ cls,
23
+ settings_cls: type[BaseSettings],
24
+ init_settings: PydanticBaseSettingsSource,
25
+ env_settings: PydanticBaseSettingsSource,
26
+ dotenv_settings: PydanticBaseSettingsSource,
27
+ file_secret_settings: PydanticBaseSettingsSource,
28
+ ) -> tuple[PydanticBaseSettingsSource, ...]:
29
+ """
30
+ Customize settings source precedence to include a user-level env file.
31
+
32
+ Precedence (highest to lowest):
33
+ - init_settings (explicit kwargs)
34
+ - env_settings (process environment)
35
+ - dotenv_settings (project .env)
36
+ - user_dotenv_settings (~/.hud/.env) ← added
37
+ - file_secret_settings
38
+ """
39
+
40
+ user_env_path = Path.home() / ".hud" / ".env"
41
+ user_dotenv_settings = DotEnvSettingsSource(
42
+ settings_cls,
43
+ env_file=user_env_path,
44
+ env_file_encoding="utf-8",
45
+ )
46
+
47
+ return (
48
+ init_settings,
49
+ env_settings,
50
+ dotenv_settings,
51
+ user_dotenv_settings,
52
+ file_secret_settings,
53
+ )
54
+
17
55
  hud_telemetry_url: str = Field(
18
56
  default="https://telemetry.hud.so/v3/api",
19
57
  description="Base URL for the HUD API",
hud/shared/hints.py CHANGED
@@ -37,8 +37,8 @@ HUD_API_KEY_MISSING = Hint(
37
37
  title="HUD API key required",
38
38
  message="Missing or invalid HUD_API_KEY.",
39
39
  tips=[
40
- "Set HUD_API_KEY environment variable",
41
- "Get a key at https://app.hud.so",
40
+ "Set HUD_API_KEY in your environment or run: hud set HUD_API_KEY=your-key-here",
41
+ "Get a key at https://hud.so",
42
42
  "Check for whitespace or truncation",
43
43
  ],
44
44
  docs_url=None,
hud/telemetry/job.py CHANGED
@@ -143,7 +143,7 @@ def _print_job_url(job_id: str, job_name: str) -> None:
143
143
  if not (settings.telemetry_enabled and settings.api_key):
144
144
  return
145
145
 
146
- url = f"https://app.hud.so/jobs/{job_id}"
146
+ url = f"https://hud.so/jobs/{job_id}"
147
147
  header = f"🚀 Job '{job_name}' started:"
148
148
 
149
149
  # ANSI color codes
@@ -182,7 +182,7 @@ def _print_job_complete_url(job_id: str, job_name: str, error_occurred: bool = F
182
182
  if not (settings.telemetry_enabled and settings.api_key):
183
183
  return
184
184
 
185
- url = f"https://app.hud.so/jobs/{job_id}"
185
+ url = f"https://hud.so/jobs/{job_id}"
186
186
 
187
187
  # ANSI color codes
188
188
  GREEN = "\033[92m"
hud/types.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import contextlib
3
4
  import json
4
5
  import logging
5
6
  import uuid
@@ -107,7 +108,13 @@ class Task(BaseModel):
107
108
 
108
109
  # Start with current environment variables
109
110
  mapping = dict(os.environ)
110
- mapping.update(settings.model_dump())
111
+ # Include settings (from process env, project .env, and user .env)
112
+ settings_dict = settings.model_dump()
113
+ mapping.update(settings_dict)
114
+ # Add UPPERCASE aliases for settings keys
115
+ for _key, _val in settings_dict.items():
116
+ with contextlib.suppress(Exception):
117
+ mapping[_key.upper()] = _val
111
118
 
112
119
  if settings.api_key:
113
120
  mapping["HUD_API_KEY"] = settings.api_key
@@ -208,7 +215,7 @@ class AgentResponse(BaseModel):
208
215
  tool_calls: list[MCPToolCall] = Field(default_factory=list)
209
216
  done: bool = Field(default=False)
210
217
 
211
- # --- TELEMETRY [app.hud.so] ---
218
+ # --- TELEMETRY [hud.so] ---
212
219
  # Responses
213
220
  content: str | None = Field(default=None)
214
221
  reasoning: str | None = Field(default=None)
hud/utils/tasks.py CHANGED
@@ -9,7 +9,7 @@ from hud.utils.hud_console import HUDConsole
9
9
  hud_console = HUDConsole()
10
10
 
11
11
 
12
- def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
12
+ def load_tasks(tasks_input: str | list[dict], *, raw: bool = False) -> list[Task] | list[dict]:
13
13
  """Load tasks from various sources.
14
14
 
15
15
  Args:
@@ -18,16 +18,19 @@ def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
18
18
  - Path to a JSONL file (one task per line)
19
19
  - HuggingFace dataset name (format: "username/dataset" or "username/dataset:split")
20
20
  - List of task dictionaries
21
- system_prompt: Default system prompt to use if not specified in task
21
+ raw: If True, return raw dicts without validation or env substitution
22
22
 
23
23
  Returns:
24
- List of validated HUD Task objects
24
+ - If raw=False (default): list[Task]
25
+ - If raw=True: list[dict]
25
26
  """
26
- tasks = []
27
+ tasks: list[Task] | list[dict] = []
27
28
 
28
29
  if isinstance(tasks_input, list):
29
30
  # Direct list of task dicts
30
31
  hud_console.info(f"Loading {len(tasks_input)} tasks from provided list")
32
+ if raw:
33
+ return [item for item in tasks_input if isinstance(item, dict)]
31
34
  for item in tasks_input:
32
35
  task = Task(**item)
33
36
  tasks.append(task)
@@ -36,7 +39,6 @@ def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
36
39
  # Check if it's a file path
37
40
  if Path(tasks_input).exists():
38
41
  file_path = Path(tasks_input)
39
- hud_console.info(f"Loading tasks from file: {tasks_input}")
40
42
 
41
43
  with open(file_path) as f:
42
44
  # Handle JSON files (array of tasks)
@@ -46,31 +48,33 @@ def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
46
48
  raise ValueError(
47
49
  f"JSON file must contain an array of tasks, got {type(data)}"
48
50
  )
49
-
51
+ if raw:
52
+ return [item for item in data if isinstance(item, dict)]
50
53
  for item in data:
51
54
  task = Task(**item)
52
55
  tasks.append(task)
53
56
 
54
57
  # Handle JSONL files (one task per line)
55
58
  else:
59
+ raw_items: list[dict] = []
56
60
  for line in f:
57
61
  line = line.strip()
58
- if line: # Skip empty lines
59
- item = json.loads(line)
60
-
61
- # Handle case where line contains an array of tasks
62
- if isinstance(item, list):
63
- for task_item in item:
64
- task = Task(**task_item)
65
- tasks.append(task)
66
- # Handle normal case where line contains a single task object
67
- elif isinstance(item, dict):
68
- task = Task(**item)
69
- tasks.append(task)
70
- else:
71
- raise ValueError(
72
- f"Invalid JSONL format: expected dict or list of dicts, got {type(item)}" # noqa: E501
73
- )
62
+ if not line:
63
+ continue
64
+ item = json.loads(line)
65
+ if isinstance(item, list):
66
+ raw_items.extend([it for it in item if isinstance(it, dict)])
67
+ elif isinstance(item, dict):
68
+ raw_items.append(item)
69
+ else:
70
+ raise ValueError(
71
+ f"Invalid JSONL format: expected dict or list of dicts, got {type(item)}" # noqa: E501
72
+ )
73
+ if raw:
74
+ return raw_items
75
+ for it in raw_items:
76
+ task = Task(**it)
77
+ tasks.append(task)
74
78
 
75
79
  # Check if it's a HuggingFace dataset
76
80
  elif "/" in tasks_input:
@@ -88,6 +92,7 @@ def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
88
92
  dataset = load_dataset(dataset_name, split=split)
89
93
 
90
94
  # Convert dataset rows to Task objects
95
+ raw_rows: list[dict] = []
91
96
  for item in dataset:
92
97
  if not isinstance(item, dict):
93
98
  raise ValueError(
@@ -97,7 +102,11 @@ def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
97
102
  raise ValueError(
98
103
  f"Invalid HuggingFace dataset: expected mcp_config and prompt, got {item}" # noqa: E501
99
104
  )
100
- task = Task(**item)
105
+ raw_rows.append(item)
106
+ if raw:
107
+ return raw_rows
108
+ for row in raw_rows:
109
+ task = Task(**row)
101
110
  tasks.append(task)
102
111
 
103
112
  except ImportError as e:
@@ -115,5 +124,4 @@ def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
115
124
  else:
116
125
  raise TypeError(f"tasks_input must be str or list, got {type(tasks_input)}")
117
126
 
118
- hud_console.info(f"Loaded {len(tasks)} tasks")
119
127
  return tasks
@@ -5,4 +5,4 @@ def test_import():
5
5
  """Test that the package can be imported."""
6
6
  import hud
7
7
 
8
- assert hud.__version__ == "0.4.35"
8
+ assert hud.__version__ == "0.4.37"
hud/version.py CHANGED
@@ -4,4 +4,4 @@ Version information for the HUD SDK.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- __version__ = "0.4.35"
7
+ __version__ = "0.4.37"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.35
3
+ Version: 0.4.37
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -36,11 +36,13 @@ Classifier: Programming Language :: Python :: 3.12
36
36
  Classifier: Programming Language :: Python :: 3.13
37
37
  Requires-Python: <3.13,>=3.11
38
38
  Requires-Dist: anthropic
39
+ Requires-Dist: blessed>=1.20.0
39
40
  Requires-Dist: datasets>=2.14.0
40
41
  Requires-Dist: httpx<1,>=0.23.0
41
42
  Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
42
43
  Requires-Dist: hud-mcp-python-sdk>=3.13.2
43
- Requires-Dist: hud-mcp-use-python-sdk>=2.3.16
44
+ Requires-Dist: hud-mcp-use-python-sdk==2.3.19
45
+ Requires-Dist: litellm>=1.55.0
44
46
  Requires-Dist: numpy>=1.24.0
45
47
  Requires-Dist: openai
46
48
  Requires-Dist: opentelemetry-api>=1.34.1
@@ -50,8 +52,8 @@ Requires-Dist: opentelemetry-sdk>=1.34.1
50
52
  Requires-Dist: pathspec>=0.12.1
51
53
  Requires-Dist: pillow>=11.1.0
52
54
  Requires-Dist: prompt-toolkit==3.0.51
53
- Requires-Dist: pydantic-settings<3,>=2
54
- Requires-Dist: pydantic<3,>=2
55
+ Requires-Dist: pydantic-settings<3,>=2.2
56
+ Requires-Dist: pydantic<3,>=2.6
55
57
  Requires-Dist: questionary==2.1.0
56
58
  Requires-Dist: rich>=13.0.0
57
59
  Requires-Dist: toml>=0.10.2
@@ -59,7 +61,9 @@ Requires-Dist: typer>=0.9.0
59
61
  Requires-Dist: watchfiles>=0.21.0
60
62
  Requires-Dist: wrapt>=1.14.0
61
63
  Provides-Extra: agent
64
+ Requires-Dist: aiodocker>=0.24.0; extra == 'agent'
62
65
  Requires-Dist: dotenv>=0.9.9; extra == 'agent'
66
+ Requires-Dist: inspect-ai>=0.3.80; extra == 'agent'
63
67
  Requires-Dist: ipykernel; extra == 'agent'
64
68
  Requires-Dist: ipython<9; extra == 'agent'
65
69
  Requires-Dist: jupyter-client; extra == 'agent'
@@ -67,8 +71,21 @@ Requires-Dist: jupyter-core; extra == 'agent'
67
71
  Requires-Dist: langchain; extra == 'agent'
68
72
  Requires-Dist: langchain-anthropic; extra == 'agent'
69
73
  Requires-Dist: langchain-openai; extra == 'agent'
74
+ Requires-Dist: pillow>=11.1.0; extra == 'agent'
75
+ Requires-Dist: playwright; extra == 'agent'
76
+ Requires-Dist: pyautogui>=0.9.54; extra == 'agent'
77
+ Requires-Dist: pyright==1.1.401; extra == 'agent'
78
+ Requires-Dist: pytest-asyncio; extra == 'agent'
79
+ Requires-Dist: pytest-cov; extra == 'agent'
80
+ Requires-Dist: pytest-mock; extra == 'agent'
81
+ Requires-Dist: pytest<9,>=8.1.1; extra == 'agent'
82
+ Requires-Dist: ruff>=0.11.8; extra == 'agent'
83
+ Requires-Dist: setuptools; extra == 'agent'
84
+ Requires-Dist: textdistance<5,>=4.5.0; extra == 'agent'
70
85
  Provides-Extra: agents
86
+ Requires-Dist: aiodocker>=0.24.0; extra == 'agents'
71
87
  Requires-Dist: dotenv>=0.9.9; extra == 'agents'
88
+ Requires-Dist: inspect-ai>=0.3.80; extra == 'agents'
72
89
  Requires-Dist: ipykernel; extra == 'agents'
73
90
  Requires-Dist: ipython<9; extra == 'agents'
74
91
  Requires-Dist: jupyter-client; extra == 'agents'
@@ -76,6 +93,17 @@ Requires-Dist: jupyter-core; extra == 'agents'
76
93
  Requires-Dist: langchain; extra == 'agents'
77
94
  Requires-Dist: langchain-anthropic; extra == 'agents'
78
95
  Requires-Dist: langchain-openai; extra == 'agents'
96
+ Requires-Dist: pillow>=11.1.0; extra == 'agents'
97
+ Requires-Dist: playwright; extra == 'agents'
98
+ Requires-Dist: pyautogui>=0.9.54; extra == 'agents'
99
+ Requires-Dist: pyright==1.1.401; extra == 'agents'
100
+ Requires-Dist: pytest-asyncio; extra == 'agents'
101
+ Requires-Dist: pytest-cov; extra == 'agents'
102
+ Requires-Dist: pytest-mock; extra == 'agents'
103
+ Requires-Dist: pytest<9,>=8.1.1; extra == 'agents'
104
+ Requires-Dist: ruff>=0.11.8; extra == 'agents'
105
+ Requires-Dist: setuptools; extra == 'agents'
106
+ Requires-Dist: textdistance<5,>=4.5.0; extra == 'agents'
79
107
  Provides-Extra: dev
80
108
  Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
81
109
  Requires-Dist: dotenv>=0.9.9; extra == 'dev'
@@ -100,14 +128,6 @@ Requires-Dist: setuptools; extra == 'dev'
100
128
  Requires-Dist: textdistance<5,>=4.5.0; extra == 'dev'
101
129
  Provides-Extra: rl
102
130
  Requires-Dist: bitsandbytes>=0.41.0; (sys_platform == 'linux') and extra == 'rl'
103
- Requires-Dist: dotenv>=0.9.9; extra == 'rl'
104
- Requires-Dist: ipykernel; extra == 'rl'
105
- Requires-Dist: ipython<9; extra == 'rl'
106
- Requires-Dist: jupyter-client; extra == 'rl'
107
- Requires-Dist: jupyter-core; extra == 'rl'
108
- Requires-Dist: langchain; extra == 'rl'
109
- Requires-Dist: langchain-anthropic; extra == 'rl'
110
- Requires-Dist: langchain-openai; extra == 'rl'
111
131
  Requires-Dist: liger-kernel>=0.5.0; (sys_platform == 'linux') and extra == 'rl'
112
132
  Requires-Dist: peft>=0.17.1; extra == 'rl'
113
133
  Requires-Dist: vllm==0.10.1.1; extra == 'rl'
@@ -138,8 +158,8 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
138
158
  ## Highlights
139
159
 
140
160
  - 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
141
- - ⚡️ **[Live telemetry](https://app.hud.so)** – inspect every tool call, observation, and reward in real time.
142
- - 🗂️ **[Public benchmarks](https://app.hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
161
+ - ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
162
+ - 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
143
163
  - 🌱 **[Reinforcement learning built-in](rl/)** – Verifiers gym pipelines for GRPO on any environment.
144
164
  - 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
145
165
  - 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
@@ -185,14 +205,14 @@ from hud.agents import ClaudeAgent
185
205
  from hud.datasets import Task # See docs: https://docs.hud.so/reference/tasks
186
206
 
187
207
  async def main() -> None:
188
- with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://app.hud.so)
208
+ with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.so)
189
209
  task = {
190
210
  "prompt": "Reach 64 in 2048.",
191
211
  "mcp_config": {
192
212
  "hud": {
193
213
  "url": "https://mcp.hud.so/v3/mcp", # HUD's cloud MCP server (see https://docs.hud.so/core-concepts/architecture)
194
214
  "headers": {
195
- "Authorization": f"Bearer {settings.api_key}", # Get your key at https://app.hud.so
215
+ "Authorization": f"Bearer {settings.api_key}", # Get your key at https://hud.so
196
216
  "Mcp-Image": "hudpython/hud-text-2048:v1.2" # Docker image from https://hub.docker.com/u/hudpython
197
217
  }
198
218
  }
@@ -219,7 +239,7 @@ async def main() -> None:
219
239
  asyncio.run(main())
220
240
  ```
221
241
 
222
- The above example let's the agent play 2048 ([See replay](https://app.hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
242
+ The above example let's the agent play 2048 ([See replay](https://hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
223
243
 
224
244
  ![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
225
245
 
@@ -250,7 +270,7 @@ Supports multi‑turn RL for both:
250
270
  - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
251
271
  - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
252
272
 
253
- By default, `hud rl` provisions a persistant server and trainer in the cloud, streams telemetry to `app.hud.so`, and lets you monitor/manage models at `app.hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
273
+ By default, `hud rl` provisions a persistant server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
254
274
 
255
275
  Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
256
276
 
@@ -260,7 +280,7 @@ This is Claude Computer Use running on our proprietary financial analyst benchma
260
280
 
261
281
  ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
262
282
 
263
- > [See this trace on _app.hud.so_](https://app.hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
283
+ > [See this trace on _hud.so_](https://hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
264
284
 
265
285
  This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py):
266
286
 
@@ -286,7 +306,7 @@ results = await run_dataset(
286
306
  print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}")
287
307
  ```
288
308
 
289
- > Running a dataset creates a job and streams results to the [app.hud.so](https://app.hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
309
+ > Running a dataset creates a job and streams results to the [hud.so](https://hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
290
310
 
291
311
  ## Building Environments (MCP)
292
312
 
@@ -377,7 +397,7 @@ Tools
377
397
  hud push # needs docker login, hud api key
378
398
  ```
379
399
 
380
- 5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [app.hud.so](https://app.hud.so):
400
+ 5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.so](https://hud.so):
381
401
 
382
402
  ```python
383
403
  from hud.agents import ClaudeAgent
@@ -408,7 +428,7 @@ result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/refere
408
428
 
409
429
  ## Leaderboards & benchmarks
410
430
 
411
- All leaderboards are publicly available on [app.hud.so/leaderboards](https://app.hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
431
+ All leaderboards are publicly available on [hud.so/leaderboards](https://hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
412
432
 
413
433
  ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png)
414
434
 
@@ -422,7 +442,7 @@ Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) funct
422
442
  %%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%%
423
443
  graph LR
424
444
  subgraph "Platform"
425
- Dashboard["📊 app.hud.so"]
445
+ Dashboard["📊 hud.so"]
426
446
  API["🔌 mcp.hud.so"]
427
447
  end
428
448