hud-python 0.4.28-py3-none-any.whl → 0.4.30-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- hud/__init__.py +2 -1
- hud/agents/base.py +81 -45
- hud/agents/claude.py +8 -4
- hud/agents/openai_chat_generic.py +66 -40
- hud/agents/tests/test_base.py +0 -4
- hud/agents/tests/test_openai.py +1 -1
- hud/cli/__init__.py +182 -52
- hud/cli/dev.py +8 -9
- hud/cli/eval.py +317 -119
- hud/cli/flows/__init__.py +0 -0
- hud/cli/flows/tasks.py +0 -0
- hud/cli/get.py +160 -0
- hud/cli/rl/__init__.py +567 -71
- hud/cli/rl/config.py +94 -0
- hud/cli/rl/display.py +133 -0
- hud/cli/rl/gpu.py +63 -0
- hud/cli/rl/gpu_utils.py +318 -0
- hud/cli/rl/presets.py +96 -0
- hud/cli/rl/remote_runner.py +347 -0
- hud/cli/rl/rl_api.py +150 -0
- hud/cli/rl/vllm.py +177 -0
- hud/cli/tests/test_analyze_metadata.py +0 -1
- hud/cli/utils/tasks.py +26 -0
- hud/clients/base.py +21 -23
- hud/clients/mcp_use.py +36 -44
- hud/clients/tests/test_mcp_use_retry.py +10 -10
- hud/datasets/__init__.py +4 -3
- hud/datasets/{execution/parallel.py → parallel.py} +1 -1
- hud/datasets/{execution/runner.py → runner.py} +1 -1
- hud/datasets/utils.py +1 -1
- hud/native/comparator.py +6 -6
- hud/native/tests/test_comparator.py +8 -8
- hud/native/tests/test_native_init.py +13 -11
- hud/otel/config.py +1 -1
- hud/otel/instrumentation.py +35 -0
- hud/rl/README.md +30 -0
- hud/rl/__init__.py +1 -0
- hud/rl/actor.py +174 -0
- hud/rl/buffer.py +371 -0
- hud/rl/chat_template.jinja +101 -0
- hud/rl/config.py +184 -0
- hud/rl/distributed.py +95 -0
- hud/rl/learner.py +589 -0
- hud/rl/tests/__init__.py +1 -0
- hud/rl/tests/test_learner.py +171 -0
- hud/rl/train.py +354 -0
- hud/rl/types.py +101 -0
- hud/rl/utils/start_vllm_server.sh +30 -0
- hud/rl/utils.py +524 -0
- hud/rl/vllm_adapter.py +125 -0
- hud/settings.py +6 -0
- hud/telemetry/__init__.py +2 -1
- hud/telemetry/job.py +46 -3
- hud/telemetry/tests/test_trace.py +3 -3
- hud/telemetry/trace.py +85 -13
- hud/tools/tests/test_computer.py +3 -3
- hud/tools/tests/test_computer_actions.py +1 -1
- hud/types.py +123 -2
- hud/utils/group_eval.py +223 -0
- hud/utils/hud_console.py +113 -13
- hud/utils/tasks.py +119 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/METADATA +20 -2
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/RECORD +68 -48
- hud/cli/hf.py +0 -406
- hud/cli/rl/README.md +0 -243
- hud/cli/rl/init.py +0 -370
- hud/cli/rl/pod.py +0 -501
- hud/cli/rl/ssh.py +0 -322
- hud/cli/rl/train.py +0 -562
- hud/cli/rl/utils.py +0 -165
- hud/datasets/execution/__init__.py +0 -13
- hud/datasets/task.py +0 -116
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/WHEEL +0 -0
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/licenses/LICENSE +0 -0
```diff
@@ -11,7 +11,7 @@ from hud.native.comparator import (
     ComparisonResult,
     DataType,
     auto_select_mode,
-
+    comparator,
     detect_type,
     extract_boolean,
     extract_json,
@@ -321,10 +321,10 @@ class TestAliasTools:
     @pytest.mark.asyncio
     async def test_aliases_work(self):
         """Test that aliases are properly registered and work."""
-        from hud.native.comparator import
+        from hud.native.comparator import comparator

         # Check that aliases are registered
-        tool_names = [t.name for t in
+        tool_names = [t.name for t in comparator._tool_manager._tools.values()]

         expected_aliases = [
             "compare_exact",
@@ -433,7 +433,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_json_alias_preprocessing(self):
         """Test JSON extraction in compare_json tool."""
-        tools = {t.name: t for t in
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
         json_tool = tools["compare_json"]

         assert isinstance(json_tool, FunctionTool)
@@ -448,7 +448,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_numeric_alias_preprocessing(self):
         """Test number extraction in numeric tools."""
-        tools = {t.name: t for t in
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}

         # Float tool
         float_tool = tools["compare_float"]
@@ -471,7 +471,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_boolean_alias_preprocessing(self):
         """Test boolean extraction in compare_boolean tool."""
-        tools = {t.name: t for t in
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
         bool_tool = tools["compare_boolean"]

         assert isinstance(bool_tool, FunctionTool)
@@ -485,7 +485,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_list_alias_preprocessing(self):
         """Test list extraction in compare_list tool."""
-        tools = {t.name: t for t in
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
         list_tool = tools["compare_list"]

         assert isinstance(list_tool, FunctionTool)
@@ -499,7 +499,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_complex_llm_output(self):
         """Test extraction from complex LLM outputs with reasoning."""
-        tools = {t.name: t for t in
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
         json_tool = tools["compare_json"]

         llm_output = """
```
hud/native/tests/test_native_init.py
CHANGED
```diff
@@ -8,12 +8,12 @@ class TestNativeInit:

     def test_comparator_server_import(self):
         """Test that comparator server can be imported."""
-        from hud.native.comparator import
+        from hud.native.comparator import comparator
         from hud.server import MCPServer

         # Verify comparator is an MCPServer instance
-        assert isinstance(
-        assert
+        assert isinstance(comparator, MCPServer)
+        assert comparator.name == "comparator"

     def test_all_exports(self):
         """Test that __all__ is properly defined."""
@@ -31,17 +31,17 @@ class TestNativeInit:

     def test_comparator_tools_registered(self):
         """Test that comparator server has tools registered."""
-        from hud.native.comparator import
+        from hud.native.comparator import comparator

         # The server should have tools registered
         # We can check that the tool manager has tools
-        tool_names = [t.name for t in
+        tool_names = [t.name for t in comparator._tool_manager._tools.values()]

         # Should have the main compare tool
         assert "compare" in tool_names

         # Should have the submit tool
-        assert "
+        assert "response" in tool_names

         # Should have all the alias tools
         expected_aliases = [
@@ -64,16 +64,18 @@ class TestNativeInit:

     def test_comparator_tool_functionality(self):
         """Test that we can get the CompareTool from the comparator."""
-        from hud.native.comparator import
-        from hud.tools import BaseTool
+        from hud.native.comparator import comparator

         # Get the compare tool
         compare_tool = None
-        for tool in
+        for tool in comparator._tool_manager._tools.values():
             if tool.name == "compare":
                 compare_tool = tool
                 break

         assert compare_tool is not None
-
-        assert hasattr(compare_tool, "
+        # FastMCP wraps tools as FunctionTool instances
+        assert hasattr(compare_tool, "name")
+        assert compare_tool.name == "compare"
+        # FunctionTool has a 'fn' attribute for the callable
+        assert hasattr(compare_tool, "fn") or hasattr(compare_tool, "__call__")
```
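The hunks above lean on FastMCP's internal tool registry rather than a public listing API. As a rough illustration (not part of the diff), this is the lookup pattern the updated tests rely on; `comparator` and the `compare`/`response` tool names come from the hunks above, and `_tool_manager._tools` is the private FastMCP attribute the tests themselves reach into:

```python
# Sketch of the pattern used by the updated tests (illustrative, not from the diff).
from hud.native.comparator import comparator

# Enumerate every tool registered on the MCPServer via FastMCP's private registry.
tool_names = [t.name for t in comparator._tool_manager._tools.values()]

assert "compare" in tool_names    # main comparison tool
assert "response" in tool_names   # submit/response tool

# The same mapping the preprocessing tests build to grab individual alias tools.
tools = {t.name: t for t in comparator._tool_manager._tools.values()}
json_tool = tools["compare_json"]
```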
hud/otel/config.py
CHANGED
```diff
@@ -111,7 +111,7 @@ def configure_telemetry(
         # Error if no exporters are configured
         raise ValueError(
             "No telemetry backend configured. Either:\n"
-            "1. Set HUD_API_KEY environment variable for HUD telemetry\n"
+            "1. Set HUD_API_KEY environment variable for HUD telemetry (https://app.hud.so)\n"
             "2. Use enable_otlp=True with configure_telemetry() for alternative backends (e.g., Jaeger)\n"  # noqa: E501
         )
     elif not settings.telemetry_enabled:
```
hud/otel/instrumentation.py
CHANGED
```diff
@@ -55,6 +55,9 @@ def _patch_mcp_instrumentation() -> None:
     try:
         from opentelemetry.instrumentation.mcp.instrumentation import McpInstrumentor

+        # First, patch the get_error_type function to handle invalid HTTP status codes
+        _patch_get_error_type()
+
         def patched_transport_wrapper(self: Any, tracer: Any) -> Callable[..., Any]:
             @asynccontextmanager
             async def traced_method(
@@ -98,3 +101,35 @@ def _patch_mcp_instrumentation() -> None:

         logger = logging.getLogger(__name__)
         logger.warning("Failed to patch MCP instrumentation: %s", e)
+
+
+def _patch_get_error_type() -> None:
+    """Patch get_error_type to handle invalid HTTP status codes gracefully."""
+    import re
+    from http import HTTPStatus
+
+    try:
+        import opentelemetry.instrumentation.mcp.instrumentation as mcp_inst
+
+        def patched_get_error_type(error_message: str) -> str | None:
+            """Extract HTTP status from error message, handling invalid codes."""
+            if not isinstance(error_message, str):
+                return None
+            match = re.search(r"\b(4\d{2}|5\d{2})\b", error_message)
+            if match:
+                num = int(match.group())
+                try:
+                    # Only return if it's a valid HTTPStatus
+                    if 400 <= num <= 599:
+                        return HTTPStatus(num).name
+                except ValueError:
+                    # Not a valid HTTP status code
+                    logger.debug("Ignoring invalid HTTP status code: %s", num)
+            return None
+
+        # Apply the patch
+        mcp_inst.get_error_type = patched_get_error_type
+        logger.debug("Patched get_error_type to handle invalid HTTP status codes")
+
+    except Exception as e:
+        logger.warning("Failed to patch get_error_type: %s", e)
```
hud/rl/README.md
ADDED
````diff
@@ -0,0 +1,30 @@
+We suggest running hud rl (or with the --local flag) for optimal hyperparameters and native HuggingFace running.
+
+However, to run this independently, spin up an instance with at least 2 GPUs and run:
+```bash
+sudo apt-get update -y && sudo apt-get install -y cuda-toolkit-12-6
+uv pip install -e .[rl]
+uv pip install ninja
+uv pip install flash-attn --no-build-isolation
+```
+
+Launch a vLLM server with:
+```bash
+export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
+export TOKENIZERS_PARALLELISM=false
+export VLLM_LOGGING_LEVEL=INFO
+export CUDA_VISIBLE_DEVICES=7 # Set this to your last GPU
+
+uv run vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
+--api-key token-abc123 --host 0.0.0.0 --port 8000 --tensor-parallel-size 1 --trust-remote-code \
+--max-model-len 16384 --enable-lora --max-lora-rank 64 --max-cpu-loras 4 --enable-auto-tool-choice \
+--tool-call-parser hermes --disable-log-requests --dtype auto
+```
+
+And training with (replace 2 with your spare GPUs):
+```bash
+hud get hud-evals/2048-basic
+torchrun --nproc-per-node 2 -m hud.rl.train --tasks 2048-basic.json --verbose
+```
+
+Add a `--config path/to/config.json` flag to run a specific configuration (or change the defaults in config.py)
````
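The defaults referenced here live in `hud/rl/config.py`, which is new in this release. As a hedged sketch, the same knobs can also be adjusted in code; the field names below are taken from `hud/rl/actor.py` further down, while the exact JSON layout accepted by `--config` is not shown in this diff:

```python
# Sketch only: overriding a few training defaults in code rather than via --config.
# Field names come from hud/rl/actor.py below; the full schema (and the JSON layout
# accepted by --config) lives in hud/rl/config.py, which is not shown in this diff.
from hud.rl.config import Config

config = Config()
config.actor.max_parallel_episodes = 1   # episodes collected concurrently per batch
config.actor.max_steps_per_episode = 6   # agent steps before an episode is cut off
config.actor.episode_timeout_sec = 120   # per-episode wall-clock timeout (seconds)
config.verbose = True
```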
hud/rl/__init__.py
ADDED
```diff
@@ -0,0 +1 @@
+"""RL module for HUD."""
```
hud/rl/actor.py
ADDED
```diff
@@ -0,0 +1,174 @@
+"""Actor for episode collection using vLLM and HUD."""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+
+import httpx
+from openai import AsyncOpenAI
+
+import hud
+from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
+from hud.clients.utils.retry_transport import create_retry_httpx_client
+from hud.types import Task, Trace
+from hud.utils.hud_console import HUDConsole
+
+from .config import Config
+
+logger = logging.getLogger(__name__)
+hud_console = HUDConsole(logger)
+
+
+class Actor:
+    """Collects episodes using vLLM-served models via HUD agents."""
+
+    def __init__(self, config: Config) -> None:
+        self.config = config
+        self.actor_config = config.actor
+        self.current_adapter = config.model.base_model
+
+        # Setup OpenAI client for vLLM
+        base_url = self.actor_config.vllm_base_url.replace("localhost", "127.0.0.1")
+        self.openai_client = self._create_openai_client(base_url)
+
+    def _create_openai_client(self, base_url: str) -> AsyncOpenAI:
+        """Create OpenAI client with optimized settings for vLLM."""
+        # Match connection limits to parallel_episodes to avoid bottlenecks
+        # Use shorter per-request timeout and keep retries modest to avoid long blocking
+        http_client = create_retry_httpx_client(
+            timeout=httpx.Timeout(30.0),
+        )
+        return AsyncOpenAI(
+            base_url=base_url,
+            api_key=self.actor_config.vllm_api_key,
+            http_client=http_client,
+            max_retries=2,
+        )
+
+    def create_agent(self) -> GenericOpenAIChatAgent:
+        """Create an agent with the current adapter."""
+        return GenericOpenAIChatAgent(
+            openai_client=self.openai_client,
+            model_name=self.current_adapter,
+            allowed_tools=self.actor_config.allowed_tools,
+            append_setup_output=False,
+            system_prompt=self.actor_config.system_prompt,
+            verbose=self.config.verbose,
+            completion_kwargs={
+                "temperature": self.actor_config.temperature,
+                "max_tokens": self.actor_config.max_new_tokens,
+                "tool_choice": "required" if self.actor_config.force_tool_choice else "auto",
+            },
+        )
+
+    def update_adapter(self, adapter_name: str) -> None:
+        """Update the current adapter being used."""
+        self.current_adapter = adapter_name
+        hud_console.info(f"[Actor] Using adapter: {adapter_name}")
+
+    async def run_tasks(self, tasks: list[Task], job_id: str) -> list[Trace]:
+        """Run tasks and collect traces."""
+        traces = []
+
+        # Process tasks in batches respecting max_parallel_episodes limit
+        for batch_start in range(0, len(tasks), self.actor_config.max_parallel_episodes):
+            batch_end = min(batch_start + self.actor_config.max_parallel_episodes, len(tasks))
+            batch = tasks[batch_start:batch_end]
+
+            # Run batch in parallel with per-episode timeout protection
+            async def run_with_timeout(t: Task) -> Trace:
+                try:
+                    return await asyncio.wait_for(
+                        self._run_task(t, job_id),
+                        timeout=self.actor_config.episode_timeout_sec,
+                    )
+                except TimeoutError:
+                    hud_console.warning_log(f"Episode timed out for task {t.id}")
+                    return Trace(isError=True, content="Episode timeout")
+
+            results = await asyncio.gather(
+                *[run_with_timeout(t) for t in batch],
+                return_exceptions=True,
+            )
+
+            # Normalize exceptions to error traces
+            for res in results:
+                if isinstance(res, Exception):
+                    hud_console.warning_log(f"Episode error: {res}")
+                    traces.append(Trace(isError=True, content=str(res)))
+                else:
+                    traces.append(res)
+
+        return traces
+
+    async def _run_task(self, task: Task, job_id: str) -> Trace:
+        """Run a single task."""
+        agent = self.create_agent()
+
+        # Run the task
+        try:
+            with hud.trace(f"Training | {task.id}", job_id=job_id):
+                result = await agent.run(task, max_steps=self.actor_config.max_steps_per_episode)
+
+        except Exception:
+            logger.info("GOT EXCEPTION")
+            return Trace(isError=True)
+
+        result.info["tool_spec"] = agent.get_tool_schemas()
+
+        return result
+
+
+if __name__ == "__main__":
+    from hud.types import Task
+
+    async def test_actor() -> None:
+        """Test the actor with a single 2048 task using local hud-browser image."""
+        config = Config()
+        config.actor.max_parallel_episodes = 1
+        config.actor.max_steps_per_episode = 6
+        config.actor.episode_timeout_sec = 120
+        config.verbose = True
+
+        # Create test task with local hud-browser image
+        task_data = {
+            "id": "test_2048_128",
+            "prompt": "Play the browser-based 2048 game and try to reach the 128 tile. Start by taking a screenshot, then make strategic moves using arrow keys.",  # noqa: E501
+            "mcp_config": {
+                "local": {
+                    "command": "sh",
+                    "args": [
+                        "-c",
+                        "docker run --rm --platform linux/amd64 -i hud-browser:latest 2>/dev/null",
+                    ],
+                }
+            },
+            "setup_tool": {"name": "launch_app", "arguments": {"app_name": "2048"}},
+            "evaluate_tool": {
+                "name": "evaluate",
+                "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}},
+            },
+            "system_prompt": "You are an expert 2048 game player. Use arrow keys to reach the target tile. First take a screenshot, then make strategic moves.",  # noqa: E501
+        }
+
+        task = Task(**task_data)
+        actor = Actor(config)
+
+        logger.info("Testing actor with task: %s", task.id)
+        logger.info("Model: %s", config.model.base_model)
+        logger.info("VLLM: %s", config.actor.vllm_base_url)
+
+        traces = await actor.run_tasks([task], job_id="test_2048")
+
+        for trace in traces:
+            if trace.isError:
+                logger.info("Error: %s", trace.content)
+            else:
+                logger.info("Success!")
+                logger.info("Trace info: %s", trace.info if hasattr(trace, "info") else "No info")
+                # Check for evaluation in the trace info
+                if hasattr(trace, "info") and "evaluation" in trace.info:
+                    logger.info("  Evaluation: %s", trace.info["evaluation"])
+
+    asyncio.run(test_actor())
```