hud-python 0.4.43__py3-none-any.whl → 0.4.45__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/agents/misc/__init__.py +2 -1
- hud/agents/misc/integration_test_agent.py +56 -0
- hud/agents/tests/test_openai.py +32 -26
- hud/cli/__init__.py +17 -4
- hud/cli/eval.py +85 -64
- hud/cli/rl/gpu_utils.py +1 -2
- hud/rl/distributed.py +40 -3
- hud/rl/learner.py +53 -5
- hud/rl/train.py +71 -52
- hud/telemetry/trace.py +4 -1
- hud/types.py +2 -1
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.43.dist-info → hud_python-0.4.45.dist-info}/METADATA +2 -2
- {hud_python-0.4.43.dist-info → hud_python-0.4.45.dist-info}/RECORD +18 -17
- {hud_python-0.4.43.dist-info → hud_python-0.4.45.dist-info}/WHEEL +0 -0
- {hud_python-0.4.43.dist-info → hud_python-0.4.45.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.43.dist-info → hud_python-0.4.45.dist-info}/licenses/LICENSE +0 -0
hud/agents/misc/integration_test_agent.py
ADDED
```diff
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from typing import Any
+
+from hud.agents.base import MCPAgent, find_reward
+from hud.types import AgentResponse, Task, Trace
+
+
+class IntegrationTestRunner(MCPAgent):
+    def __init__(self, **kwargs: Any) -> None:
+        kwargs["auto_trace"] = False
+        super().__init__(**kwargs)
+        self.metadata = {}
+
+    async def run(self, task: Task, max_steps: int = 10) -> Trace:
+        try:
+            # Initialize using base to set up client and telemetry correctly
+            await self.initialize(task)
+
+            # Validate task shape
+            if not getattr(task, "integration_test_tool", None):
+                raise ValueError(
+                    "--integration-test requires task.integration_test_tool (single call)"
+                )
+            elif not getattr(task, "evaluate_tool", None):
+                raise ValueError("--integration-test requires task.evaluate_tool (single call)")
+
+            if task.setup_tool:
+                _ = await self.call_tools(task.setup_tool)
+
+            _ = await self.call_tools(task.integration_test_tool)
+            evaluate_result = await self.call_tools(task.evaluate_tool)
+
+            reward = float(find_reward(evaluate_result[0])) if evaluate_result else 0.0
+
+            return Trace(done=True, reward=reward, info={})
+        finally:
+            # Ensure resources are cleaned up so the CLI can exit cleanly
+            await self._cleanup()
+
+    # Stub implementations to satisfy abstract base class; not used in --integration-test path
+    async def get_system_messages(self) -> list[Any]:
+        return []
+
+    async def get_response(self, messages: list[Any]) -> AgentResponse:
+        raise NotImplementedError("IntegrationTestRunner does not implement agent loop")
+
+    async def format_blocks(self, blocks: list[Any]) -> list[Any]:
+        return []
+
+    async def format_tool_results(
+        self,
+        tool_calls: list[Any],
+        tool_results: list[Any],
+    ) -> list[Any]:
+        return []
```
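For orientation, a minimal sketch of driving the new runner directly; the task object is assumed to be loaded elsewhere (for example from a tasks JSON file), and `verbose=True` mirrors how `build_agent` constructs the runner in hud/cli/eval.py:

```python
import asyncio

from hud.agents.misc.integration_test_agent import IntegrationTestRunner
from hud.types import Task


async def check(task: Task) -> float:
    runner = IntegrationTestRunner(verbose=True)
    # run() calls setup_tool (if any), then integration_test_tool,
    # then evaluate_tool, and cleans up the MCP client even on error.
    trace = await runner.run(task)
    return trace.reward


# reward = asyncio.run(check(my_task))  # my_task: a Task with integration_test_tool set
```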
hud/agents/tests/test_openai.py
CHANGED
```diff
@@ -146,37 +146,43 @@ class TestOperatorAgent:
     @pytest.mark.asyncio
     async def test_get_model_response(self, mock_mcp_client, mock_openai):
         """Test getting model response from OpenAI API."""
-
-
-
-
-
+        # Disable telemetry for this test to avoid backend configuration issues
+        with patch("hud.settings.settings.telemetry_enabled", False):
+            agent = OperatorAgent(
+                mcp_client=mock_mcp_client,
+                model_client=mock_openai,
+                validate_api_key=False,  # Skip validation in tests
+            )
+
+            # Set up available tools so agent doesn't return "No computer use tools available"
+            agent._available_tools = [
+                types.Tool(name="computer_openai", description="Computer tool", inputSchema={})
+            ]
 
-
-
-
-
+            # Mock OpenAI API response for a successful computer use response
+            mock_response = MagicMock()
+            mock_response.id = "response_123"
+            mock_response.state = "completed"
+            # Mock the output message structure
+            mock_output_text = MagicMock()
+            mock_output_text.type = "output_text"
+            mock_output_text.text = "I can see the screen content."
 
-
-
-
-        mock_response.state = "completed"
-        # Mock the output message structure
-        mock_output_text = MagicMock()
-        mock_output_text.type = "output_text"
-        mock_output_text.text = "I can see the screen content."
-        mock_output_message = MagicMock()
-        mock_output_message.type = "message"
-        mock_output_message.content = [mock_output_text]
-        mock_response.output = [mock_output_message]
+            mock_output_message = MagicMock()
+            mock_output_message.type = "message"
+            mock_output_message.content = [mock_output_text]
 
-
+            mock_response.output = [mock_output_message]
 
-
-
+            mock_openai.responses.create = AsyncMock(return_value=mock_response)
+
+            messages = [{"prompt": "What's on the screen?", "screenshot": None}]
+            response = await agent.get_response(messages)
 
-
-
+            # The test should verify that the response is processed correctly
+            # Since the isinstance checks will fail, content will be empty, but done should be True
+            assert response.done is True
+            assert response.tool_calls == []
 
     @pytest.mark.asyncio
     async def test_handle_empty_response(self, mock_mcp_client, mock_openai):
```
hud/cli/__init__.py
CHANGED
```diff
@@ -144,7 +144,7 @@ def debug(
         None,
         help="Docker image, environment directory, or config file followed by optional Docker arguments",  # noqa: E501
     ),
-    config: Path = typer.Option(  # noqa: B008
+    config: Path | None = typer.Option(  # noqa: B008
         None,
         "--config",
         "-c",
@@ -976,6 +976,15 @@ def eval(
         "--group-size",
         help="Number of times to run each task (similar to RL training)",
     ),
+    integration_test: bool = typer.Option(
+        False,
+        "--integration-test",
+        help=(
+            "Run integration_test_tool, where problem is setup, "
+            "actions are applied, and evaluation is performed, without "
+            "spinning up an agent"
+        ),
+    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents."""
     from hud.settings import settings
@@ -983,6 +992,9 @@ def eval(
 
     hud_console = HUDConsole()
 
+    if integration_test:
+        agent = "integration_test"
+
     # If no source provided, reuse RL helper to find a tasks file interactively
     if source is None:
         try:
@@ -1038,7 +1050,7 @@ def eval(
         agent = hud_console.select("Select an agent to use:", choices=choices, default=0)
 
     # Handle HUD model selection
-    if agent and agent not in ["claude", "openai", "vllm", "litellm"]:
+    if agent and agent not in ["claude", "openai", "vllm", "litellm", "integration_test"]:
        # Find remote model name
        model = agent
        if not vllm_base_url:
@@ -1059,7 +1071,7 @@ def eval(
         hud_console.info(f"Using HUD model: {model} (trained on {base_model})")
 
     # Validate agent choice
-    valid_agents = ["claude", "openai", "vllm", "litellm"]
+    valid_agents = ["claude", "openai", "vllm", "litellm", "integration_test"]
     if agent not in valid_agents:
         hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
         raise typer.Exit(1)
@@ -1080,6 +1092,7 @@ def eval(
         very_verbose=very_verbose,
         vllm_base_url=vllm_base_url,
         group_size=group_size,
+        integration_test=integration_test,
     )
 
 
@@ -1105,7 +1118,7 @@ def get(
     ),
 ) -> None:
     """📥 Download a HuggingFace dataset and save it as JSONL."""
-    from .get import get_command
+    from hud.cli.get import get_command
 
     get_command(
         dataset_name=dataset_name,
```
hud/cli/eval.py
CHANGED
```diff
@@ -69,7 +69,7 @@ def get_available_models() -> list[dict[str, str | None]]:
 
 
 def build_agent(
-    agent_type: Literal["claude", "openai", "vllm", "litellm"],
+    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"],
     *,
     model: str | None = None,
     allowed_tools: list[str] | None = None,
@@ -79,7 +79,11 @@ def build_agent(
     """Create and return the requested agent type."""
 
     # Import agents lazily to avoid dependency issues
-    if agent_type == "vllm":
+    if agent_type == "integration_test":
+        from hud.agents.misc.integration_test_agent import IntegrationTestRunner
+
+        return IntegrationTestRunner(verbose=verbose)
+    elif agent_type == "vllm":
         # Create a generic OpenAI agent for vLLM server
         try:
             from openai import AsyncOpenAI
@@ -185,7 +189,7 @@ def build_agent(
 async def run_single_task(
     source: str,
     *,
-    agent_type: Literal["claude", "openai", "vllm", "litellm"] = "claude",
+    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
     model: str | None = None,
     allowed_tools: list[str] | None = None,
     max_steps: int = 10,
@@ -205,12 +209,9 @@ async def run_single_task(
         )
         raise typer.Exit(1) from e
 
-    # Check if it's a file
     path = Path(source)
     if path.exists() and (path.suffix in [".json", ".jsonl"]):
         hud_console.info("📊 Loading task file…")
-
-        # Use unified loader for both JSON and JSONL
         tasks: list[Task] = load_tasks(str(path))  # type: ignore[assignment]
 
         # If tasks reference a local environment (nearby), ensure it's built/up-to-date.
@@ -218,13 +219,14 @@ async def run_single_task(
             env_dir = find_environment_dir(path)
             if env_dir is not None:
                 # Non-interactive for eval; warn but don't block
-                ensure_built(env_dir, interactive=
+                ensure_built(env_dir, interactive=False)
         except Exception as e:
             hud_console.debug(f"Eval preflight env check skipped: {e}")
 
         # Single task - use the first (and only) task
         task = tasks[0]
         hud_console.info("Found 1 task, running as single task…")
+
     else:
         # Load from HuggingFace dataset or non-file source
         hud_console.info(f"📊 Loading tasks from: {source}…")
@@ -243,60 +245,67 @@ async def run_single_task(
     task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt
 
     # Use grouped evaluation if group_size > 1
-
-
-
+    agent_config: dict[str, Any] = {}
+    if agent_type == "integration_test":
+        from hud.agents.misc.integration_test_agent import IntegrationTestRunner
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        agent_class = IntegrationTestRunner
+        agent_config = {"verbose": verbose}
+        if allowed_tools:
+            agent_config["allowed_tools"] = allowed_tools
+    elif agent_type == "vllm":
+        # Special handling for vLLM
+        sample_agent = build_agent(
+            agent_type,
+            model=model,
+            allowed_tools=allowed_tools,
+            verbose=verbose,
+            vllm_base_url=vllm_base_url,
+        )
+        agent_config = {
+            "openai_client": sample_agent.oai,
+            "model_name": sample_agent.model_name,
+            "verbose": verbose,
+            "completion_kwargs": sample_agent.completion_kwargs,
+        }
+        if allowed_tools:
+            agent_config["allowed_tools"] = allowed_tools
 
-
+        from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
 
-
-
-
+        agent_class = GenericOpenAIChatAgent
+    elif agent_type == "openai":
+        from hud.agents import OperatorAgent
 
-
-
-
-
-
-
+        agent_class = OperatorAgent
+        agent_config = {"verbose": verbose}
+        if allowed_tools:
+            agent_config["allowed_tools"] = allowed_tools
+    elif agent_type == "litellm":
+        from hud.agents.lite_llm import LiteAgent
 
-
-
-
-
-
-
-
-
-
+        agent_class = LiteAgent
+        agent_config = {
+            "model_name": model or "gpt-4o-mini",
+            "verbose": verbose,
+        }
+        if allowed_tools:
+            agent_config["allowed_tools"] = allowed_tools
+    elif agent_type == "claude":
+        from hud.agents import ClaudeAgent
 
-
-
-
-
-
-
-
+        agent_class = ClaudeAgent
+        agent_config = {
+            "model": model or "claude-sonnet-4-20250514",
+            "verbose": verbose,
+        }
+        if allowed_tools:
+            agent_config["allowed_tools"] = allowed_tools
+    else:
+        raise ValueError(f"Invalid agent type: {agent_type}")
 
+    if group_size > 1:
+        hud_console.info(f"🔄 Running task with group_size={group_size}")
         # Run with grouping
         stats = await run_tasks_grouped(
             tasks=[task],
@@ -307,10 +316,7 @@ async def run_single_task(
             max_steps=max_steps,
             verbose=verbose,
         )
-
-        # Display results
         display_group_statistics(stats, show_details=True)
-
     else:
         # Original single-run logic
         with hud.trace(name=task_prompt):
@@ -329,7 +335,7 @@ async def run_single_task(
 async def run_full_dataset(
     source: str,
     *,
-    agent_type: Literal["claude", "openai", "vllm", "litellm"] = "claude",
+    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
     model: str | None = None,
     allowed_tools: list[str] | None = None,
     max_concurrent: int = 30,
@@ -372,10 +378,13 @@ async def run_full_dataset(
     path = Path(source)
     dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1]
 
-    hud_console.info(f"Found {len(tasks)} tasks")
-
     # Build agent class + config for run_dataset
-    if agent_type == "vllm":
+    if agent_type == "integration_test":  # --integration-test mode
+        from hud.agents.misc.integration_test_agent import IntegrationTestRunner
+
+        agent_class = IntegrationTestRunner
+        agent_config = {"verbose": verbose}
+    elif agent_type == "vllm":
         try:
             from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
 
@@ -405,7 +414,6 @@ async def run_full_dataset(
             }
             if allowed_tools:
                 agent_config["allowed_tools"] = allowed_tools
-
         elif agent_type == "openai":
             try:
                 from hud.agents import OperatorAgent
@@ -557,7 +565,7 @@ def eval_command(
         "--full",
         help="Run the entire dataset (omit for single-task debug mode)",
     ),
-    agent: Literal["claude", "openai", "vllm", "litellm"] = typer.Option(
+    agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option(
         "claude",
         "--agent",
         help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
@@ -573,7 +581,7 @@ def eval_command(
         help="Comma-separated list of allowed tools",
     ),
     max_concurrent: int = typer.Option(
-
+        30,
         "--max-concurrent",
         help="Concurrency level for asyncio mode (ignored in parallel mode)",
     ),
@@ -618,6 +626,15 @@ def eval_command(
         "--group-size",
         help="Number of times to run each task (similar to RL training)",
     ),
+    integration_test: bool = typer.Option(
+        False,
+        "--integration-test",
+        help=(
+            "Run integration_test_tool tool, where problem is setup, "
+            "actions are applied, and evaluation is performed, without "
+            "spinning up an agent"
+        ),
+    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents.
 
@@ -674,6 +691,10 @@ def eval_command(
     logging.getLogger("hud.agents").setLevel(logging.INFO)
     logging.getLogger("hud.agents.base").setLevel(logging.INFO)
 
+    # We pass integration_test as the agent_type
+    if integration_test:
+        agent = "integration_test"
+
     # Check for required API keys
     if agent == "claude":
         if not settings.anthropic_api_key:
```
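A rough programmatic equivalent of the new mode, using the updated `run_single_task` signature from this file; the tasks file path is hypothetical:

```python
import asyncio

from hud.cli.eval import run_single_task

# Dispatches to IntegrationTestRunner instead of an LLM-backed agent,
# so no model API key is needed on this path.
asyncio.run(
    run_single_task(
        "tasks.json",  # hypothetical file; each task defines integration_test_tool
        agent_type="integration_test",
    )
)
```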
hud/cli/rl/gpu_utils.py
CHANGED
```diff
@@ -7,8 +7,6 @@ import subprocess
 import time
 from typing import TYPE_CHECKING, Any
 
-import torch
-
 from hud.utils.hud_console import HUDConsole
 
 if TYPE_CHECKING:
@@ -87,6 +85,7 @@ def health_check_gpus(gpu_indices: list[int]) -> dict[str, Any]:
     - all_healthy: Boolean indicating if all GPUs are healthy
     - memory_issues: Boolean indicating if there are memory issues
     """
+    import torch
     from rich.console import Console
     from rich.table import Table
 
```
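The gpu_utils change is the standard deferred-import pattern: `import torch` moves from module scope into the function that needs it, so importing the CLI module no longer requires torch to be installed at all. A generic sketch of the pattern (not the hud code itself):

```python
def health_check() -> bool:
    import torch  # deferred: only needed when a GPU check actually runs

    return torch.cuda.is_available()
```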
hud/rl/distributed.py
CHANGED
```diff
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import os
+from datetime import timedelta
 from typing import Any
 
 import torch
@@ -17,7 +18,10 @@ def setup_distributed() -> None:
     torch.cuda.set_device(local_rank)
 
     # Initialize process group
-
+    # Increase watchdog timeout to accommodate long eval/sampling phases
+    # and enable clearer NCCL error handling.
+    os.environ.setdefault("TORCH_NCCL_ASYNC_ERROR_HANDLING", "1")
+    dist.init_process_group("nccl", timeout=timedelta(minutes=20))
 
 
 def get_local_rank() -> int:
@@ -66,15 +70,48 @@ def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
 
 
 def broadcast_object(obj: Any, src: int = 0) -> Any:
-    """Broadcast a Python object from src rank to all ranks.
+    """Broadcast a Python object from src rank to all ranks.
+
+    Args:
+        obj: Object to broadcast (used on src rank)
+        src: Source rank
+        device: Device for temporary tensor buffer during pickling transfer
+    """
     if not dist.is_initialized():
         return obj
 
     obj_list = [obj] if dist.get_rank() == src else [None]
-    dist.broadcast_object_list(obj_list, src=src)
+    dist.broadcast_object_list(obj_list, src=src, device=torch.device("cpu"))
     return obj_list[0]
 
 
+def scatter_object(
+    obj_list: list[Any] | None,
+    src: int = 0,
+) -> Any:
+    """Scatter a list of Python objects from src so each rank receives one object.
+
+    Usage:
+    - On src rank: pass the full list (length == world_size)
+    - On non-src ranks: pass None
+
+    Returns:
+        The object intended for this rank.
+    """
+    if not dist.is_initialized():
+        # Single-process: return first element if provided, else None
+        if obj_list is None or len(obj_list) == 0:
+            return None
+        return obj_list[0]
+
+    out: list[Any] = [None]
+    if dist.get_rank() == src:
+        dist.scatter_object_list(out, obj_list, src=src)
+    else:
+        dist.scatter_object_list(out, None, src=src)
+    return out[0]
+
+
 def gather_tensors(tensor: torch.Tensor) -> list[torch.Tensor] | None:
     """Gather tensors from all ranks to rank 0.
 
```
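A sketch of the calling convention the new `scatter_object` documents: rank 0 supplies one slice per rank, every other rank passes None, and each rank gets back only its own slice (the payload contents here are illustrative):

```python
import torch.distributed as dist

from hud.rl.distributed import scatter_object

world_size = dist.get_world_size() if dist.is_initialized() else 1
is_src = (not dist.is_initialized()) or dist.get_rank() == 0

# Rank 0 (the src) builds one slice per rank; all other ranks pass None.
payload = [[f"sample-{r}"] for r in range(world_size)] if is_src else None
my_slice = scatter_object(payload, src=0)  # each rank receives only its slice
```

Unlike `broadcast_object`, which serializes the full list to every rank, this moves only one slice per rank, which is why `train.py` switches to it below.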
hud/rl/learner.py
CHANGED
```diff
@@ -7,7 +7,6 @@ import os
 from typing import TYPE_CHECKING, Any
 
 import torch
-import torch.nn.functional as F
 from peft import LoraConfig, get_peft_model
 from torch.nn.parallel import DistributedDataParallel as DDP
 from transformers import (
@@ -240,6 +239,8 @@ class GRPOLearner:
             if sample.inputs:
                 sample = sample.to_device(self.device)
                 sample.old_logprobs, _ = self.compute_logprobs(self.policy, sample.inputs)
+                # Free GPU memory for this sample immediately
+                sample.to_device(torch.device("cpu"))
 
         policy_module = self.policy.module if hasattr(self.policy, "module") else self.policy
         with policy_module.disable_adapter():
@@ -247,7 +248,10 @@ class GRPOLearner:
             if is_main_process():
                 progress.update(f"Processing batch of traces... {i}/{len(batch)}")
             if sample.inputs:
+                # Move back to GPU for reference computation, then free
+                sample = sample.to_device(self.device)
                 sample.ref_logprobs, _ = self.compute_logprobs(self.policy, sample.inputs)
+                sample.to_device(torch.device("cpu"))
 
         hud_console.info_log("Creating mini-batches...")
         group_size = self.config.training.group_size
@@ -488,10 +492,13 @@ class GRPOLearner:
         out = model(**model_inputs)
 
         logits = out.logits / self.config.actor.temperature
-        log_probs = F.log_softmax(logits, dim=-1)
 
         targets = inputs["input_ids"][:, 1:]
-
+
+        # Align logits to predict next token: use logits[:, :-1, :]
+        next_logits = logits[:, :-1, :]
+
+        token_log_probs = _selective_log_softmax(next_logits, targets)
 
         # Compute entropy only for assistant tokens to save memory
         assistant_mask = inputs["assistant_mask"]
@@ -506,8 +513,19 @@ class GRPOLearner:
         # Return dummy values that match expected shapes
         seq_len = inputs["input_ids"].shape[1] - 1 if "input_ids" in inputs else 0
         batch_size = inputs["input_ids"].shape[0] if "input_ids" in inputs else 1
-
-
+        # Create dummy tensors that still participate in autograd so backward doesn't fail
+        try:
+            # Touch params to build a graph
+            param_sum = torch.sum(next(self.policy.parameters()))
+            base = param_sum * 0.0
+        except StopIteration:
+            base = torch.tensor(0.0, device=self.device)
+        dummy_logprobs = (
+            base + torch.zeros(batch_size, seq_len, device=self.device)
+        ).requires_grad_(True)
+        dummy_entropy = (
+            base + torch.zeros(batch_size, seq_len, device=self.device)
+        ).requires_grad_(True)
         return dummy_logprobs, dummy_entropy
 
     def save(self, path: str) -> None:
@@ -587,3 +605,33 @@ def sanity_check(
     rho_diag[m] = torch.exp(masked_log_rho[m].clamp(-20.0, 20.0))
     _stats("ratio_tok(masked)", ratio_diag)
     _stats("rho_tok(masked)", rho_diag)
+
+
+def _selective_log_softmax(
+    logits_bt_v: torch.Tensor,
+    index_bt: torch.Tensor,
+) -> torch.Tensor:
+    """Gather log softmax for selected indices with reduced peak memory.
+
+    Uses logsumexp subtraction for float32/64; falls back to per-row
+    log_softmax for bf16/fp16.
+    logits_bt_v: [B, T, V]
+    index_bt: [B, T]
+    Returns: [B, T]
+    """
+    if logits_bt_v.dtype in (torch.float32, torch.float64):
+        # Compute logsumexp per [B, T] in a loop over batch to reduce
+        # peak from B*T*V to T*V
+        logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits_bt_v])
+        selected_logits = torch.gather(logits_bt_v, dim=-1, index=index_bt.unsqueeze(-1)).squeeze(
+            -1
+        )
+        return selected_logits - logsumexp_values
+    # Reduced precision: numerically stable route using per-row log_softmax
+    token_logprobs_rows: list[torch.Tensor] = []
+    for logits_row, index_row in zip(logits_bt_v, index_bt, strict=True):
+        logprobs_row = logits_row.log_softmax(dim=-1)
+        token_logprobs_rows.append(
+            torch.gather(logprobs_row, dim=-1, index=index_row.unsqueeze(-1)).squeeze(-1)
+        )
+    return torch.stack(token_logprobs_rows)
```
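For intuition, the new helper should match the dense `F.log_softmax` + gather formulation while avoiding a full [B, T, V] log-probability tensor; a self-contained sanity check of the float32 path (a reimplementation for illustration, not an import of the private helper):

```python
import torch
import torch.nn.functional as F


def selective_log_softmax(logits: torch.Tensor, index: torch.Tensor) -> torch.Tensor:
    # float32 route from the diff: selected logit minus the row-wise logsumexp
    lse = torch.stack([torch.logsumexp(row, dim=-1) for row in logits])  # [B, T]
    sel = torch.gather(logits, dim=-1, index=index.unsqueeze(-1)).squeeze(-1)  # [B, T]
    return sel - lse


B, T, V = 2, 5, 11
logits = torch.randn(B, T, V)
targets = torch.randint(0, V, (B, T))

dense = torch.gather(F.log_softmax(logits, dim=-1), -1, targets.unsqueeze(-1)).squeeze(-1)
assert torch.allclose(selective_log_softmax(logits, targets), dense, atol=1e-6)
```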
hud/rl/train.py
CHANGED
```diff
@@ -11,9 +11,8 @@ import argparse
 import asyncio
 import json
 import logging
-from datetime import datetime
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast
 
 import hud
 from hud.rl.actor import Actor
@@ -25,6 +24,7 @@ from hud.rl.distributed import (
     get_global_rank,
     get_world_size,
     is_main_process,
+    scatter_object,
     setup_distributed,
     synchronize,
 )
@@ -133,53 +133,71 @@ async def train(config: Config, tasks: list[Task]) -> None:
         global_reward_stats = None
         global_advantage_stats = None
 
-        #
+        # Step-state gate: ensure all ranks branch coherently
+        state = {"ok": False, "err": None, "num_samples": 0}
+        rank_samples = None
+        episode_time_value = None
+
+        # Only rank 0 runs tasks and prepares distribution
         if is_main_process() and actor is not None:
             import time
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                episode_start_time = time.time()
+                traces = await actor.run_tasks(tasks, job_id=job_id)
+                episode_time = time.time() - episode_start_time
+                hud_console.info(f"Sampled {len(traces)} traces in {episode_time:.1f}s")
+                trace_buffer.add(traces)
+                global_reward_stats = [trace.reward for trace in traces]
+
+                # Get all traces from buffer for distribution
+                all_traces = trace_buffer.sample_traces()
+
+                # Preprocess traces to training samples
+                preprocessed_traces = preprocess_advantages(all_traces, config)
+
+                # Store these for later use in metrics
+                global_advantage_stats = [sample.advantage for sample in preprocessed_traces]
+
+                # Distribute preprocessed samples in groups across ranks via scatter
+                # Ensure list length is a multiple of num_gpus by allowing empty per-rank slices
+                gpu_batch_size = max(1, (len(preprocessed_traces) + num_gpus - 1) // num_gpus)
+                rank_samples = [
+                    preprocessed_traces[i : i + gpu_batch_size]
+                    for i in range(0, len(preprocessed_traces), gpu_batch_size)
+                ]
+                # Pad rank_samples to exactly num_gpus entries
+                if len(rank_samples) < num_gpus:
+                    rank_samples.extend([[] for _ in range(num_gpus - len(rank_samples))])
+
+                # Log distribution info
+                dist_msg = (
+                    f"Distributing {len(preprocessed_traces)} samples as {gpu_batch_size} "
+                    f"sized batches across {num_gpus} GPUs"
+                )
+                hud_console.info(dist_msg)
+                for rank in range(num_gpus):
+                    n_samples = len(rank_samples[rank]) if rank < len(rank_samples) else 0
+                    hud_console.info(f"  Rank {rank}: {n_samples} samples")
+
+                hud_console.section_title(f"Training on {len(all_traces)} traces")
+                episode_time_value = episode_time
+
+                state.update({"ok": True, "num_samples": len(preprocessed_traces)})
+            except Exception as e:
+                state.update({"ok": False, "err": str(e)})
+
+        # Broadcast step-state to keep ranks in lockstep
+        state = broadcast_object(state, src=0)
+        if not state.get("ok", False):
+            hud_console.warning("Step failed on rank 0; skipping this step coherently")
+            synchronize()
+            continue
 
-
-
-
-            )
-            for rank in range(num_gpus):
-                n_samples = len(rank_samples[rank])
-                hud_console.info(f"  Rank {rank}: {n_samples} samples")
-
-            hud_console.section_title(f"Training on {len(all_traces)} traces")
-            episode_time_value = episode_time
-        else:
-            rank_samples = None
-            episode_time_value = None
-
-        # Broadcast each rank's samples and episode time
-        rank_samples = broadcast_object(rank_samples, src=0)
+        # Scatter per-rank samples; each rank receives only its slice
+        my_samples = scatter_object(rank_samples if is_main_process() else None, src=0)
+        # Broadcast the episode time (small object)
         episode_time_value = broadcast_object(episode_time_value, src=0)
-        my_samples = rank_samples[get_global_rank()] if rank_samples else []
 
         # Process only assigned samples
         last_metrics = learner.update(my_samples)
@@ -230,18 +248,18 @@ async def train(config: Config, tasks: list[Task]) -> None:
         if step % config.training.save_every_batches == 0:
             if is_main_process() and vllm is not None and actor is not None:
                 hud_console.section_title("Saving checkpoint and updating vLLM")
-
-                now = datetime.now()
-                checkpoint_id = now.strftime("%Y%m%d_%H%M%S") + f"-{get_global_rank()}"
-                checkpoint_path = (
-                    Path(config.out_dir) / f"{config.adapter_prefix}-{checkpoint_id}"
-                )
+                checkpoint_path = Path(config.out_dir) / f"{config.adapter_prefix}-{step}"
                 learner.save(str(checkpoint_path))
 
                 # Wait for 6 seconds to ensure the checkpoint is saved
                 await asyncio.sleep(6)
 
-
+                # If there is a previous adapter, unload it
+                current_adapter = vllm.get_current()
+                if current_adapter is not None:
+                    vllm.unload_adapter(current_adapter)
+
                 adapter_name = f"{config.adapter_prefix}-{step}"
                 if vllm.load_adapter(adapter_name, str(checkpoint_path)):
                     actor.update_adapter(adapter_name)
                     hud_console.info(f"✓ Checkpoint saved and loaded: {adapter_name}")
@@ -356,7 +374,8 @@ async def main() -> None:
     )
 
     # Run training
-
+    tasks_typed = cast("list[Task]", tasks)
+    await train(config, tasks_typed)
 
 
 if __name__ == "__main__":
```
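The per-rank split uses ceiling division, so every sample is assigned and trailing ranks may get short or empty slices; a tiny standalone illustration of the arithmetic:

```python
samples = list(range(10))
num_gpus = 4

gpu_batch_size = max(1, (len(samples) + num_gpus - 1) // num_gpus)  # ceil(10 / 4) = 3
rank_samples = [samples[i : i + gpu_batch_size] for i in range(0, len(samples), gpu_batch_size)]
if len(rank_samples) < num_gpus:  # pad with empty slices if fewer chunks than ranks
    rank_samples.extend([[] for _ in range(num_gpus - len(rank_samples))])

print(rank_samples)  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
```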
hud/telemetry/trace.py
CHANGED
```diff
@@ -138,7 +138,10 @@ def trace(
         task_run_id = str(uuid.uuid4())
     else:
         # Use a placeholder for custom backends
-
+        logger.warning(
+            "HUD API key is not set, using a placeholder for the task run ID. If this looks wrong, check your API key."  # noqa: E501
+        )
+        task_run_id = str(uuid.uuid4())
 
     # Create trace object
     trace_obj = Trace(task_run_id, name, job_id, task_id)
```
hud/types.py
CHANGED
```diff
@@ -42,6 +42,7 @@ class Task(BaseModel):
     mcp_config: dict[str, Any]
     setup_tool: MCPToolCall | list[MCPToolCall] | None = None
     evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
+    integration_test_tool: MCPToolCall | list[MCPToolCall] | None = None
     agent_tools: list[str] | None = None
     system_prompt: str | None = None
     metadata: dict[str, Any] = Field(default_factory=dict)
@@ -59,7 +60,7 @@ class Task(BaseModel):
             raise HudConfigError(f"Invalid JSON string: {e}") from e
         return v
 
-    @field_validator("setup_tool", "evaluate_tool", mode="before")
+    @field_validator("setup_tool", "evaluate_tool", "integration_test_tool", mode="before")
     @classmethod
     def convert_dict_to_tool_call(cls, v: Any, info: Any) -> Any:
         """Convert dict (with shorthands) to MCPToolCall instance.
```
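With the new field and the widened validator, tasks can declare the integration-test call using the same dict shorthand as setup/evaluate; a hedged sketch (the tool names and mcp_config are illustrative):

```python
from hud.types import Task

task = Task(
    prompt="verify the environment evaluates a scripted solution",
    mcp_config={"local": {"command": "docker", "args": ["run", "my-env"]}},  # illustrative
    setup_tool={"name": "setup"},             # dicts are converted to MCPToolCall
    integration_test_tool={"name": "solve"},  # instances by the before-validator
    evaluate_tool={"name": "evaluate"},       # shown above
)
```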
hud/utils/tests/test_version.py
CHANGED
hud/version.py
CHANGED
{hud_python-0.4.43.dist-info → hud_python-0.4.45.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.43
+Version: 0.4.45
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -41,7 +41,7 @@ Requires-Dist: datasets>=2.14.0
 Requires-Dist: httpx<1,>=0.23.0
 Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
 Requires-Dist: hud-mcp-python-sdk>=3.13.2
-Requires-Dist: hud-mcp-use-python-sdk==2.3.
+Requires-Dist: hud-mcp-use-python-sdk==2.3.20
 Requires-Dist: numpy>=1.24.0
 Requires-Dist: openai
 Requires-Dist: opentelemetry-api>=1.34.1
```

{hud_python-0.4.43.dist-info → hud_python-0.4.45.dist-info}/RECORD
CHANGED
```diff
@@ -1,8 +1,8 @@
 hud/__init__.py,sha256=JMDFUE1pP0J1Xl_miBdt7ERvoffZmTzSFe8yxz512A8,552
 hud/__main__.py,sha256=YR8Dq8OhINOsVfQ55PmRXXg4fEK84Rt_-rMtJ5rvhWo,145
 hud/settings.py,sha256=disObWa-DgXzoDcCDp3y1dTPaNsbR0IvoMJL9Eg4zyo,3947
-hud/types.py,sha256=
-hud/version.py,sha256=
+hud/types.py,sha256=RVwfx9rIF-D6P5HPwz9WuCzcbNhWHd_wId4uqanjah4,11170
+hud/version.py,sha256=2pC5m12J53jX9Lpu_BkaFfqK7F0i6YtYE5ycic5fiZY,105
 hud/agents/__init__.py,sha256=UoIkljWdbq4bM0LD-mSaw6w826EqdEjOk7r6glNYwYQ,286
 hud/agents/base.py,sha256=_u1zR3gXzZ1RlTCUYdMcvgHqdJBC4-AB1lZt0yBx8lg,35406
 hud/agents/claude.py,sha256=TGhm5gE2ltINDAdEsDxKuT9iGMQ5G87R6kmabU3KPt8,16101
@@ -11,22 +11,23 @@ hud/agents/langchain.py,sha256=1EgCy8jfjunsWxlPC5XfvfLS6_XZVrIF1ZjtHcrvhYw,9584
 hud/agents/lite_llm.py,sha256=_3wbUiYCp7q8Vyu9rhaoJDvmb_bsyUsLYWP3iQJ2bHo,2239
 hud/agents/openai.py,sha256=O1xV1h1l-W8lmnmXqTYr5CwnmnaniMqOxAZbl2CTTng,14576
 hud/agents/openai_chat_generic.py,sha256=_vAID9dZ_UxL0elYwafskRcsdrSsLsxJ4zPrP58oBiw,12151
-hud/agents/misc/__init__.py,sha256=
+hud/agents/misc/__init__.py,sha256=LbVpHl2bDtheGPixbRRKsEjujwzmrXs7sCS8u1sYfAk,219
+hud/agents/misc/integration_test_agent.py,sha256=-gxn8U7MKGKcq6e6uc64neY8iCrP0PutjL7qWTY8bfg,2017
 hud/agents/misc/response_agent.py,sha256=uMuRDkz5QgaMQliNzBRepond5sb7KyqIiKm3LstjVnw,3753
 hud/agents/tests/__init__.py,sha256=W-O-_4i34d9TTyEHV-O_q1Ai1gLhzwDaaPo02_TWQIY,34
 hud/agents/tests/test_base.py,sha256=bDznxQDv2ickRkw98joH9zfuZT6ItHbmWvQ67iboa4g,28733
 hud/agents/tests/test_claude.py,sha256=0nZnfsbGoECvsLPdmaRnc9jVmrehVvc3kxeyiCQI2Cc,13807
 hud/agents/tests/test_client.py,sha256=uikgh6yhjPPX2RBU4XJQMz1mNox9uXjuwsP8t93id18,13337
 hud/agents/tests/test_grounded_openai_agent.py,sha256=VK8lUvHIjWicMX00VKPE-FZyjiJqTEhb80MuRRa9fVc,5437
-hud/agents/tests/test_openai.py,sha256=
-hud/cli/__init__.py,sha256=
+hud/agents/tests/test_openai.py,sha256=dnAFAoBKZf-5dtDpj6UC3q7oZv2tdMFcniPU0emfImw,8020
+hud/cli/__init__.py,sha256=KFC2PLi_1wIxVIx2HB4qk3m9G4-Q5UXyxBHiZANhC4I,46221
 hud/cli/__main__.py,sha256=fDH7XITyuDITwSDIVwRso06aouADO0CzTHKqp5TOwJE,143
 hud/cli/analyze.py,sha256=4u5oYfJMquOjT9PzzRTYVcTZDxDi0ilNP_g532_hpOU,14716
 hud/cli/build.py,sha256=h-4SAoe3j8Pth3mPYf26vh7q1Do5JADlvKKwkZrf2AU,19551
 hud/cli/clone.py,sha256=AwVDIuhr8mHb1oT2Af2HrD25SiTdwATpE6zd93vzLgA,6099
 hud/cli/debug.py,sha256=jtFW8J5F_3rhq1Hf1_SkJ7aLS3wjnyIs_LsC8k5cnzc,14200
 hud/cli/dev.py,sha256=2zUeVz5S__WrV-DLSDqOlQawcJS7eYPKiDRVUaJ8mAk,31579
-hud/cli/eval.py,sha256=
+hud/cli/eval.py,sha256=ssnYc8FfjbPIfFr30Pq82JuX20Hk8-z6EfDcEuOj37s,26610
 hud/cli/get.py,sha256=sksKrdzBGZa7ZuSoQkc0haj-CvOGVSSikoVXeaUd3N4,6274
 hud/cli/init.py,sha256=YkWxkIDCnhnxGGpbm7IvYMcfDqWuO1X9wxDxE4k-9ew,9721
 hud/cli/list_func.py,sha256=EVi2Vc3Lb3glBNJxFx4MPnZknZ4xmuJz1OFg_dc8a_E,7177
@@ -40,7 +41,7 @@ hud/cli/rl/celebrate.py,sha256=trGEJn3xebexlHwFVKPJKhRujVVV8sy7TQTJvRd2p9A,5947
 hud/cli/rl/config.py,sha256=A-4WWwAS68GRKx1cP_DJ-NZD_96cFNnGwx0P3pQT1ps,3271
 hud/cli/rl/display.py,sha256=hqJVGmO9csYinladhZwjF-GMvppYWngxDHajTyIJ_gM,5214
 hud/cli/rl/gpu.py,sha256=peXS-NdUF5RyuSs0aZoCzGLboneBUpCy8f9f99WMrG0,2009
-hud/cli/rl/gpu_utils.py,sha256=
+hud/cli/rl/gpu_utils.py,sha256=0nFRrmJZzLOHh_0bjMhIsBj94PAuu95vwxLd_sa4Q5g,11202
 hud/cli/rl/local_runner.py,sha256=NFsNmRZ4nenPnb45ZtdsILeICKEq11wmpLwq9E-a8ZE,22614
 hud/cli/rl/presets.py,sha256=DzOO82xL5QyzdVtlX-Do1CODMvDz9ILMPapjU92jcZg,3051
 hud/cli/rl/remote_runner.py,sha256=fKmOVKSBUWfakunfe9-HAllpUJDxfRNZwL00fPw-QTI,17837
@@ -121,9 +122,9 @@ hud/rl/actor.py,sha256=H6gwRGRY1YpkOyiaJ9yai8yQwcI-Gx0dFxd18jpLx_Q,6950
 hud/rl/buffer.py,sha256=z47HOjOBJx3umUzzUfdtq_N4ZoJ8FMBPkX8YQKBtd3A,15457
 hud/rl/chat_template.jinja,sha256=XTdzI8oFGEcSA-exKxyHaprwRDmX5Am1KEb0VxvUc6U,4965
 hud/rl/config.py,sha256=akQ2a53NX3Dh1UWgMyw7mTxq33eiQbZcBpmKTzd79Xk,5624
-hud/rl/distributed.py,sha256=
-hud/rl/learner.py,sha256=
-hud/rl/train.py,sha256
+hud/rl/distributed.py,sha256=Mr3NEj3rbS9FgpHofC_GrqpkvNQSpPFOqLQc2NXPNXs,3678
+hud/rl/learner.py,sha256=K73M50RLHbm7bAMi3hKCqaw_OMZuUcqEUr4YGioqpc4,26756
+hud/rl/train.py,sha256=-ilVkSlwqzfMV8nnCX2OVCqy5GO2perma6BQ5bwx3yY,14971
 hud/rl/types.py,sha256=lrLKo7iaqodYth2EyeuOQfLiuzXfYM2eJjPmpObrD7c,3965
 hud/rl/utils.py,sha256=IsgVUUibxnUzb32a4mu1sYrgJC1CwoG9E-Dd5y5VDOA,19115
 hud/rl/vllm_adapter.py,sha256=2wnTfoXPI4C9EzhVxk0GU-ArLjX7hgXS0BndMwN8Ppg,4751
@@ -157,7 +158,7 @@ hud/telemetry/__init__.py,sha256=uWiloBMXgEzPRsRIOpiSBhcTxJDyHfBqTg7qi8kxSTc,683
 hud/telemetry/instrument.py,sha256=m3u6YK02PTk39Jr4L3se7l-cYyKx0maCaqf5Z5JqWNA,14096
 hud/telemetry/job.py,sha256=LjspT-mSqQO2DnFL6h0ZkCkeMrrpjAuFVZnTJiOaDek,11585
 hud/telemetry/replay.py,sha256=YW17s314s5Wy6Rl8MXHqg1FU8EF9_XcHBMJI0rrkyS4,2306
-hud/telemetry/trace.py,sha256=
+hud/telemetry/trace.py,sha256=nHSw4lKRXuHgKQoMIIYgM635FEHc-9baRLbfn5YwoyQ,4836
 hud/telemetry/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hud/telemetry/tests/test_replay.py,sha256=eREc6qgSJDRT1pOPdyhiEoEJ9H2yT1ospaU1RvTKlvg,1328
 hud/telemetry/tests/test_trace.py,sha256=0rxR77CjcStat3ILA9QAswieOJ3J_386QmjmNDp34oA,2486
@@ -218,10 +219,10 @@ hud/utils/tests/test_init.py,sha256=2QLQSGgyP9wJhOvPCusm_zjJad0qApOZi1BXpxcdHXQ,
 hud/utils/tests/test_mcp.py,sha256=0pUa16mL-bqbZDXp5NHBnt1gO5o10BOg7zTMHZ1DNPM,4023
 hud/utils/tests/test_progress.py,sha256=QSF7Kpi03Ff_l3mAeqW9qs1nhK50j9vBiSobZq7T4f4,7394
 hud/utils/tests/test_telemetry.py,sha256=5jl7bEx8C8b-FfFUko5pf4UY-mPOR-9HaeL98dGtVHM,2781
-hud/utils/tests/test_version.py,sha256=
+hud/utils/tests/test_version.py,sha256=e2xkBr8ieuCKotlh_ywbJGNvO_dx0C0GPfD4lOUitrU,160
 hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hud_python-0.4.43.dist-info/METADATA,sha256=
-hud_python-0.4.43.dist-info/WHEEL,sha256=
-hud_python-0.4.43.dist-info/entry_points.txt,sha256=
-hud_python-0.4.43.dist-info/licenses/LICENSE,sha256=
-hud_python-0.4.43.dist-info/RECORD,,
+hud_python-0.4.45.dist-info/METADATA,sha256=j3l9VYG9PmGzvRip759et-evgmGq-nzHGaCSyYFC0og,22275
+hud_python-0.4.45.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+hud_python-0.4.45.dist-info/entry_points.txt,sha256=jJbodNFg1m0-CDofe5AHvB4zKBq7sSdP97-ohaQ3ae4,63
+hud_python-0.4.45.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
+hud_python-0.4.45.dist-info/RECORD,,
```
File without changes
|
|
File without changes
|
|
File without changes
|