hud-python 0.4.36__py3-none-any.whl → 0.4.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of hud-python has been flagged as potentially problematic on the registry page.
- hud/agents/__init__.py +2 -0
- hud/agents/lite_llm.py +72 -0
- hud/agents/openai_chat_generic.py +21 -7
- hud/cli/__init__.py +19 -4
- hud/cli/build.py +17 -2
- hud/cli/dev.py +1 -1
- hud/cli/eval.py +93 -13
- hud/cli/flows/tasks.py +197 -65
- hud/cli/push.py +9 -0
- hud/cli/rl/__init__.py +14 -4
- hud/cli/rl/celebrate.py +187 -0
- hud/cli/rl/config.py +15 -8
- hud/cli/rl/local_runner.py +44 -20
- hud/cli/rl/remote_runner.py +163 -86
- hud/cli/rl/viewer.py +141 -0
- hud/cli/rl/wait_utils.py +89 -0
- hud/cli/utils/env_check.py +196 -0
- hud/cli/utils/source_hash.py +108 -0
- hud/clients/base.py +1 -1
- hud/clients/fastmcp.py +1 -1
- hud/otel/config.py +1 -1
- hud/otel/context.py +2 -2
- hud/rl/vllm_adapter.py +1 -1
- hud/server/server.py +84 -13
- hud/server/tests/test_add_tool.py +60 -0
- hud/server/tests/test_context.py +128 -0
- hud/server/tests/test_mcp_server_handlers.py +44 -0
- hud/server/tests/test_mcp_server_integration.py +405 -0
- hud/server/tests/test_mcp_server_more.py +247 -0
- hud/server/tests/test_run_wrapper.py +53 -0
- hud/server/tests/test_server_extra.py +166 -0
- hud/server/tests/test_sigterm_runner.py +78 -0
- hud/shared/hints.py +1 -1
- hud/telemetry/job.py +2 -2
- hud/types.py +9 -2
- hud/utils/tasks.py +32 -24
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.36.dist-info → hud_python-0.4.37.dist-info}/METADATA +14 -12
- {hud_python-0.4.36.dist-info → hud_python-0.4.37.dist-info}/RECORD +43 -29
- {hud_python-0.4.36.dist-info → hud_python-0.4.37.dist-info}/WHEEL +0 -0
- {hud_python-0.4.36.dist-info → hud_python-0.4.37.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.36.dist-info → hud_python-0.4.37.dist-info}/licenses/LICENSE +0 -0
hud/agents/__init__.py
CHANGED
@@ -2,12 +2,14 @@ from __future__ import annotations
 
 from .base import MCPAgent
 from .claude import ClaudeAgent
+from .lite_llm import LiteAgent
 from .openai import OperatorAgent
 from .openai_chat_generic import GenericOpenAIChatAgent
 
 __all__ = [
     "ClaudeAgent",
     "GenericOpenAIChatAgent",
+    "LiteAgent",
     "MCPAgent",
     "OperatorAgent",
 ]
hud/agents/lite_llm.py
ADDED
@@ -0,0 +1,72 @@
+"""LiteLLM MCP Agent implementation.
+
+Same OpenAI chat-completions shape + MCP tool plumbing,
+but transport is LiteLLM and (optionally) tools are shaped by LiteLLM's MCP transformer.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, ClassVar
+
+import litellm
+
+from .openai_chat_generic import GenericOpenAIChatAgent
+
+logger = logging.getLogger(__name__)
+
+# Prefer LiteLLM's built-in MCP -> OpenAI tool transformer (handles Bedrock nuances)
+try:
+    from litellm.experimental_mcp_client.tools import (
+        transform_mcp_tool_to_openai_tool,
+    )
+except Exception:  # pragma: no cover - optional dependency
+    transform_mcp_tool_to_openai_tool = None  # type: ignore
+
+
+class LiteAgent(GenericOpenAIChatAgent):
+    """
+    Same OpenAI chat-completions shape + MCP tool plumbing,
+    but transport is LiteLLM and (optionally) tools are shaped by LiteLLM's MCP transformer.
+    """
+
+    metadata: ClassVar[dict[str, Any]] = {}
+
+    def __init__(
+        self,
+        *,
+        model_name: str = "gpt-4o-mini",
+        completion_kwargs: dict[str, Any] | None = None,
+        **agent_kwargs: Any,
+    ) -> None:
+        # We don't need an OpenAI client; pass None
+        super().__init__(
+            openai_client=None,
+            model_name=model_name,
+            completion_kwargs=completion_kwargs,
+            **agent_kwargs,
+        )
+
+    def get_tool_schemas(self) -> list[dict]:
+        # Prefer LiteLLM's stricter transformer (handles Bedrock & friends)
+        if transform_mcp_tool_to_openai_tool is not None:
+            return [
+                transform_mcp_tool_to_openai_tool(t)  # returns ChatCompletionToolParam-like dict
+                for t in self.get_available_tools()
+            ]
+        # Fallback to the generic OpenAI sanitizer
+        return GenericOpenAIChatAgent.get_tool_schemas(self)
+
+    async def _invoke_chat_completion(
+        self,
+        *,
+        messages: list[Any],
+        tools: list[dict] | None,
+        extra: dict[str, Any],
+    ):
+        return await litellm.acompletion(
+            model=self.model_name,
+            messages=messages,
+            tools=tools or None,  # LiteLLM tolerates None better than []
+            **extra,
+        )
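
For context, a minimal usage sketch of the new agent (the model string and completion kwargs below are illustrative, and LiteLLM reads provider credentials such as OPENAI_API_KEY from the environment):

    from hud.agents import LiteAgent

    # LiteAgent needs no OpenAI SDK client; it routes through litellm.acompletion.
    agent = LiteAgent(
        model_name="gpt-4o-mini",                # any LiteLLM-routable model id
        completion_kwargs={"temperature": 0.0},  # forwarded (minus protected keys) to the completion call
    )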
hud/agents/openai_chat_generic.py
CHANGED

@@ -42,7 +42,7 @@ class GenericOpenAIChatAgent(MCPAgent):
     def __init__(
         self,
         *,
-        openai_client: AsyncOpenAI,
+        openai_client: AsyncOpenAI | None,
         model_name: str = "gpt-4o-mini",
         completion_kwargs: dict[str, Any] | None = None,
         **agent_kwargs: Any,

@@ -171,6 +171,23 @@ class GenericOpenAIChatAgent(MCPAgent):
             openai_tools.append(openai_tool)
         return openai_tools
 
+    async def _invoke_chat_completion(
+        self,
+        *,
+        messages: list[Any],
+        tools: list[dict] | None,
+        extra: dict[str, Any],
+    ):
+        if self.oai is None:
+            raise ValueError("openai_client is required for GenericOpenAIChatAgent")
+        # default transport = OpenAI SDK
+        return await self.oai.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            tools=tools,  # already ChatCompletionToolParam-shaped
+            **extra,
+        )
+
     @instrument(
         span_type="agent",
         record_args=False,

@@ -180,17 +197,14 @@ class GenericOpenAIChatAgent(MCPAgent):
         """Send chat request to OpenAI and convert the response."""
 
         # Convert MCP tool schemas to OpenAI format
-        mcp_schemas = self.get_tool_schemas()
+        tools = cast("list[ChatCompletionToolParam]", self.get_tool_schemas())
 
         protected_keys = {"model", "messages", "tools"}
         extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
 
         try:
-            response = await self.oai.chat.completions.create(
-                model=self.model_name,
-                messages=messages,
-                tools=cast("list[ChatCompletionToolParam]", mcp_schemas),
-                **extra,
+            response = await self._invoke_chat_completion(
+                messages=messages, tools=tools, extra=extra
             )
         except Exception as e:
             error_content = f"Error getting response {e}"
hud/cli/__init__.py
CHANGED

@@ -912,7 +912,7 @@ def eval(
     agent: str | None = typer.Argument(
         None,
         help=(
-            "Agent backend to use (claude, openai, or
+            "Agent backend to use (claude, openai, vllm, or litellm). If not provided, will prompt interactively."  # noqa: E501
         ),
     ),
     full: bool = typer.Option(

@@ -960,6 +960,12 @@
         "--verbose",
         help="Enable verbose output from the agent",
     ),
+    very_verbose: bool = typer.Option(
+        False,
+        "--very-verbose",
+        "-vv",
+        help="Enable debug-level logs for maximum visibility",
+    ),
     vllm_base_url: str | None = typer.Option(
         None,
         "--vllm-base-url",

@@ -1025,13 +1031,14 @@
             {"name": "Claude 4 Sonnet", "value": "claude"},
             {"name": "OpenAI Computer Use", "value": "openai"},
             {"name": "vLLM (Local Server)", "value": "vllm"},
+            {"name": "LiteLLM (Multi-provider)", "value": "litellm"},
         ]
     )
 
     agent = hud_console.select("Select an agent to use:", choices=choices, default=0)
 
     # Handle HUD model selection
-    if agent and agent not in ["claude", "openai", "vllm"]:
+    if agent and agent not in ["claude", "openai", "vllm", "litellm"]:
         # Find remote model name
         model = agent
         if not vllm_base_url:

@@ -1052,7 +1059,7 @@
         hud_console.info(f"Using HUD model: {model} (trained on {base_model})")
 
     # Validate agent choice
-    valid_agents = ["claude", "openai", "vllm"]
+    valid_agents = ["claude", "openai", "vllm", "litellm"]
     if agent not in valid_agents:
         hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
         raise typer.Exit(1)

@@ -1070,6 +1077,7 @@
         max_workers=max_workers,
         max_concurrent_per_worker=max_concurrent_per_worker,
         verbose=verbose,
+        very_verbose=very_verbose,
         vllm_base_url=vllm_base_url,
         group_size=group_size,
     )

@@ -1119,7 +1127,7 @@ def rl(
     ),
     model: str | None = typer.Argument(
         None,
-        help="Model to train (default: interactive selection)",
+        help="Model to train from https://hud.so/models (default: interactive selection)",
     ),
     config_file: Path | None = typer.Option(  # noqa: B008
         None,

@@ -1159,6 +1167,12 @@
         "--ddp-gpus",
         help="Specific GPUs for DDP (e.g., '0,1,2,3')",
     ),
+    yes: bool = typer.Option(
+        False,
+        "--yes",
+        "-y",
+        help="Auto-accept all prompts and use defaults (lazy mode)",
+    ),
     vllm_gpu: int | None = typer.Option(
         None,
         "--vllm-gpu",

@@ -1180,6 +1194,7 @@
         no_ddp=no_ddp,
         ddp_gpus=ddp_gpus,
         vllm_gpu=vllm_gpu,
+        yes=yes,
     )
 
hud/cli/build.py
CHANGED

@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import asyncio
+import contextlib
 import hashlib
 import subprocess
 import time

@@ -13,6 +14,7 @@ from typing import Any
 import typer
 import yaml
 
+from hud.cli.utils.source_hash import compute_source_hash, list_source_files
 from hud.clients import MCPClient
 from hud.utils.hud_console import HUDConsole
 from hud.version import __version__ as hud_version

@@ -341,10 +343,11 @@
     required_env, optional_env = extract_env_vars_from_dockerfile(dockerfile_path)
 
     # Merge user-provided env vars with detected ones
-    provided_env_vars = {}
+    provided_env_vars: dict[str, str] = {}
     missing_required = []
     if env_vars:
-
+        # Use placeholders in lock file for any provided values to avoid storing secrets
+        provided_env_vars = {k: f"${{{k}}}" for k in env_vars}
         # Track which required vars are still missing
         missing_required = [e for e in required_env if e not in env_vars]
 

@@ -384,6 +387,8 @@
         "hudVersion": hud_version,
         "directory": str(env_dir.name),
         "version": new_version,  # Internal environment version
+        # Fast source fingerprint for change detection
+        "sourceHash": compute_source_hash(env_dir),
     },
     "environment": {
         "initializeMs": analysis["initializeMs"],

@@ -424,6 +429,16 @@
     with open(lock_path, "w") as f:
         yaml.dump(lock_content, f, default_flow_style=False, sort_keys=False)
 
+    # Also write the file list we hashed for transparency (non-essential)
+    with contextlib.suppress(Exception):
+        files = [
+            str(p.resolve().relative_to(env_dir)).replace("\\", "/")
+            for p in list_source_files(env_dir)
+        ]
+        lock_content["build"]["sourceFiles"] = files
+        with open(lock_path, "w") as f:
+            yaml.dump(lock_content, f, default_flow_style=False, sort_keys=False)
+
     hud_console.success("Created lock file: hud.lock.yaml")
 
     # Calculate lock file hash
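
The new hud/cli/utils/source_hash.py module itself is not shown in this diff, but the intent is a deterministic fingerprint over the environment's source tree so the CLI can detect stale builds. A rough sketch of the idea (not the actual implementation; the file-filtering rules are assumptions):

    import hashlib
    from pathlib import Path


    def source_hash_sketch(env_dir: Path) -> str:
        """Hash relative paths + contents of source files, in sorted order,
        so the digest is stable across machines (illustrative only)."""
        digest = hashlib.sha256()
        for path in sorted(env_dir.rglob("*")):
            if path.is_file() and ".git" not in path.parts:
                digest.update(str(path.relative_to(env_dir)).encode())
                digest.update(path.read_bytes())
        return digest.hexdigest()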
hud/cli/dev.py
CHANGED

@@ -530,7 +530,7 @@ async def start_mcp_proxy(
                 stderr=asyncio.subprocess.DEVNULL,
             )
             await stop_result.communicate()
-            hud_console.success("
+            hud_console.success("Container stopped successfully")
             container_stopped = True
         except Exception as e:
             hud_console.warning(f"Failed to stop container: {e}")
hud/cli/eval.py
CHANGED

@@ -5,15 +5,18 @@ from __future__ import annotations
 import asyncio
 import logging
 from pathlib import Path
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 import typer
 
 import hud
+from hud.cli.utils.env_check import ensure_built, find_environment_dir
 from hud.settings import settings
 from hud.utils.group_eval import display_group_statistics, run_tasks_grouped
 from hud.utils.hud_console import HUDConsole
 
+if TYPE_CHECKING:
+    from hud.types import Task
 logger = logging.getLogger(__name__)
 hud_console = HUDConsole()
 

@@ -27,7 +30,7 @@ def get_available_models() -> list[dict[str, str | None]]:
     try:
         from hud.cli.rl import rl_api
 
-        hud_console.info("Fetching your models from https://
+        hud_console.info("Fetching your models from https://hud.so/models")
         models = rl_api.list_models()
 
         # Filter for ready models only and sort by recency

@@ -66,7 +69,7 @@ def get_available_models() -> list[dict[str, str | None]]:
 
 
 def build_agent(
-    agent_type: Literal["claude", "openai", "vllm"],
+    agent_type: Literal["claude", "openai", "vllm", "litellm"],
     *,
     model: str | None = None,
     allowed_tools: list[str] | None = None,

@@ -138,6 +141,22 @@
         else:
             return OperatorAgent(verbose=verbose)
 
+    elif agent_type == "litellm":
+        try:
+            from hud.agents.lite_llm import LiteAgent
+        except ImportError as e:
+            hud_console.error(
+                "LiteLLM agent dependencies are not installed. "
+                "Please install with: pip install 'hud-python[agent]'"
+            )
+            raise typer.Exit(1) from e
+
+        return LiteAgent(
+            model_name=model or "gpt-4o-mini",
+            allowed_tools=allowed_tools,
+            verbose=verbose,
+        )
+
     # Fallback Claude agent (Anthropic)
     try:
         from hud.agents import ClaudeAgent

@@ -166,7 +185,7 @@
 async def run_single_task(
     source: str,
     *,
-    agent_type: Literal["claude", "openai", "vllm"] = "claude",
+    agent_type: Literal["claude", "openai", "vllm", "litellm"] = "claude",
     model: str | None = None,
     allowed_tools: list[str] | None = None,
     max_steps: int = 10,

@@ -192,7 +211,16 @@
         hud_console.info("📊 Loading task file…")
 
         # Use unified loader for both JSON and JSONL
-        tasks = load_tasks(str(path))
+        tasks: list[Task] = load_tasks(str(path))  # type: ignore[assignment]
+
+        # If tasks reference a local environment (nearby), ensure it's built/up-to-date.
+        try:
+            env_dir = find_environment_dir(path)
+            if env_dir is not None:
+                # Non-interactive for eval; warn but don't block
+                ensure_built(env_dir, interactive=True)
+        except Exception as e:
+            hud_console.debug(f"Eval preflight env check skipped: {e}")
 
         # Single task - use the first (and only) task
         task = tasks[0]

@@ -200,7 +228,7 @@
     else:
         # Load from HuggingFace dataset or non-file source
         hud_console.info(f"📊 Loading tasks from: {source}…")
-        tasks = load_tasks(source)
+        tasks: list[Task] = load_tasks(source)  # type: ignore[assignment]
 
     if not tasks:
         hud_console.error(f"No tasks found in: {source}")

@@ -248,6 +276,16 @@
         agent_config = {"verbose": verbose}
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
+    elif agent_type == "litellm":
+        from hud.agents.lite_llm import LiteAgent
+
+        agent_class = LiteAgent
+        agent_config = {
+            "model_name": model or "gpt-4o-mini",
+            "verbose": verbose,
+        }
+        if allowed_tools:
+            agent_config["allowed_tools"] = allowed_tools
     else:
         from hud.agents import ClaudeAgent
 

@@ -292,7 +330,7 @@
 async def run_full_dataset(
     source: str,
     *,
-    agent_type: Literal["claude", "openai", "vllm"] = "claude",
+    agent_type: Literal["claude", "openai", "vllm", "litellm"] = "claude",
     model: str | None = None,
     allowed_tools: list[str] | None = None,
     max_concurrent: int = 30,

@@ -322,7 +360,7 @@
 
     # Load tasks using unified loader
     hud_console.info(f"📊 Loading tasks from: {source}…")
-    tasks = load_tasks(source)
+    tasks: list[Task] = load_tasks(source)  # type: ignore[assignment]
 
     if not tasks:
         hud_console.error(f"No tasks found in: {source}")

@@ -385,6 +423,25 @@
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
 
+    elif agent_type == "litellm":
+        try:
+            from hud.agents.lite_llm import LiteAgent
+
+            agent_class = LiteAgent
+        except ImportError as e:
+            hud_console.error(
+                "LiteLLM agent dependencies are not installed. "
+                "Please install with: pip install 'hud-python[agent]'"
+            )
+            raise typer.Exit(1) from e
+
+        agent_config = {
+            "model_name": model or "gpt-4o-mini",
+            "verbose": verbose,
+        }
+        if allowed_tools:
+            agent_config["allowed_tools"] = allowed_tools
+
     else:
         try:
             from hud.agents import ClaudeAgent

@@ -501,10 +558,10 @@ def eval_command(
         "--full",
         help="Run the entire dataset (omit for single-task debug mode)",
     ),
-    agent: Literal["claude", "openai", "vllm"] = typer.Option(
+    agent: Literal["claude", "openai", "vllm", "litellm"] = typer.Option(
         "claude",
         "--agent",
-        help="Agent backend to use (claude, openai,
+        help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
     ),
     model: str | None = typer.Option(
         None,

@@ -546,6 +603,12 @@
         "--verbose",
         help="Enable verbose output from the agent",
     ),
+    very_verbose: bool = typer.Option(
+        False,
+        "--very-verbose",
+        "-vv",
+        help="Enable debug-level logs for maximum visibility",
+    ),
     vllm_base_url: str | None = typer.Option(
         None,
         "--vllm-base-url",

@@ -595,6 +658,23 @@
     """
     from hud.settings import settings
 
+    if very_verbose:
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format="%(asctime)s - %(name)s - %(message)s",
+            datefmt="%H:%M:%S",
+        )
+        logging.getLogger("hud.agents").setLevel(logging.DEBUG)
+        logging.getLogger("hud.agents.base").setLevel(logging.DEBUG)
+    elif verbose:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s - %(name)s - %(message)s",
+            datefmt="%H:%M:%S",
+        )
+        logging.getLogger("hud.agents").setLevel(logging.INFO)
+        logging.getLogger("hud.agents.base").setLevel(logging.INFO)
+
     # Check for required API keys
     if agent == "claude":
         if not settings.anthropic_api_key:

@@ -617,7 +697,7 @@
     # Check for HUD_API_KEY if using HUD services
     if not settings.api_key:
         hud_console.warning("HUD_API_KEY not set. Some features may be limited.")
-        hud_console.info("Get your API key at: https://
+        hud_console.info("Get your API key at: https://hud.so")
         hud_console.info("Set it in your environment or run: hud set HUD_API_KEY=your-key-here")
 
     # Parse allowed tools

@@ -642,7 +722,7 @@
             parallel=parallel,
             max_workers=max_workers,
             max_concurrent_per_worker=max_concurrent_per_worker,
-            verbose=verbose,
+            verbose=very_verbose or verbose,
             vllm_base_url=vllm_base_url,
             group_size=group_size,
         )

@@ -655,7 +735,7 @@
             model=model,
             allowed_tools=allowed_tools_list,
             max_steps=max_steps,
-            verbose=verbose,
+            verbose=very_verbose or verbose,
             vllm_base_url=vllm_base_url,
             group_size=group_size,
         )
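
Taken together, the eval changes mean the litellm backend can also be driven programmatically. A hedged sketch (the task file path is a placeholder; the other keyword arguments shown in the signature above keep their defaults):

    import asyncio

    from hud.cli.eval import run_single_task

    asyncio.run(
        run_single_task(
            "tasks.json",          # placeholder task file
            agent_type="litellm",  # new alongside claude/openai/vllm
            model="gpt-4o-mini",   # becomes LiteAgent(model_name=...)
            verbose=True,
        )
    )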