synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +186 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
- examples/multi_step/crafter_rl_lora.md +51 -10
- examples/multi_step/sse_metrics_streaming_notes.md +357 -0
- examples/multi_step/task_app_config_notes.md +7 -1
- examples/swe/task_app/grpo_swe_mini.py +55 -26
- examples/swe/task_app/hosted/rollout.py +40 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/task_app/__init__.py +2 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +21 -46
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +67 -49
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +242 -193
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
- examples/task_apps/pokemon_red/task_app.py +606 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +4 -2
- examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +4 -2
- examples/warming_up_to_rl/run_eval.py +127 -18
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +41 -1
- synth_ai/api/train/builders.py +73 -29
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +44 -0
- synth_ai/api/train/configs/rl.py +134 -0
- synth_ai/api/train/configs/sft.py +95 -0
- synth_ai/api/train/configs/shared.py +24 -0
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +7 -51
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +49 -43
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/rl_demo.py +86 -106
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/task_apps.py +1710 -186
- synth_ai/demos/core/cli.py +121 -159
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
- synth_ai/environments/examples/crafter_classic/environment.py +16 -0
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +30 -4
- synth_ai/evals/__init__.py +15 -0
- synth_ai/evals/client.py +82 -0
- synth_ai/evals/types.py +42 -0
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +127 -0
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +14 -5
- synth_ai/task/contracts.py +124 -38
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +53 -0
- synth_ai/task/rubrics/loaders.py +133 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +113 -0
- synth_ai/task/rubrics/strict.py +149 -0
- synth_ai/task/server.py +8 -7
- synth_ai/task/validators.py +269 -6
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +130 -0
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/native_manager.py +3 -3
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +228 -89
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -1
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Shared fixtures for Sokoban tests."""
|
|
2
|
+
import os
|
|
3
|
+
import socket
|
|
4
|
+
import subprocess
|
|
5
|
+
from subprocess import TimeoutExpired
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Iterator
|
|
9
|
+
|
|
10
|
+
import pytest
|
|
11
|
+
|
|
12
|
+
requests = pytest.importorskip("requests")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _which(executable: str) -> bool:
    """Return True when *executable* can actually be launched from PATH.

    The lookup is delegated to :func:`shutil.which`, which only reports
    regular files that are executable and, on Windows, also tries the
    PATHEXT suffixes. A bare ``Path.exists()`` probe would accept
    directories or non-executable files with the same name and make the
    caller attempt to spawn something that cannot run.

    Args:
        executable: Bare command name to look up (for example ``"uv"``).

    Returns:
        True if a runnable match was found, False otherwise.
    """
    # Function-scope import keeps this helper self-contained.
    import shutil

    return shutil.which(executable) is not None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _find_free_port() -> int:
    """Ask the OS for a currently unused TCP port on 127.0.0.1.

    Binding to port 0 lets the kernel pick the port. The probing socket
    is closed before returning, so the number is only a hint: nothing
    reserves the port for the caller afterwards.

    Returns:
        The port number the kernel assigned to the probe socket.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind(("127.0.0.1", 0))
        address = probe.getsockname()
        return address[1]
    finally:
        probe.close()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _wait_for_server(base_url: str, timeout: float = 60.0) -> None:
    """Poll ``GET {base_url}/info`` until the task app answers.

    A 200 response means the server is ready. A 400 or 401 is also
    treated as ready: the server is up and merely rejecting the
    unauthenticated probe.

    Args:
        base_url: Root URL of the server, without a trailing slash.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If no acceptable response arrived before the
            deadline.
    """
    # Monotonic clock: wall-clock adjustments must not stretch or shrink the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            resp = requests.get(f"{base_url}/info", timeout=2.0)
        except requests.RequestException:
            # Connection refused / timed out: the server is not listening yet.
            pass
        else:
            if resp.status_code in (200, 400, 401):
                return
        # Back off after every unsuccessful attempt, including unexpected
        # HTTP statuses, so the loop never spins without sleeping.
        time.sleep(0.5)
    raise RuntimeError(f"Task app at {base_url} did not become ready")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@pytest.fixture(scope="module")
def sokoban_server(tmp_path_factory: pytest.TempPathFactory) -> Iterator[str]:
    """Start the Sokoban task app in a subprocess and yield its base URL.

    The server is launched with ``uv run -m synth_ai task-app serve sokoban``
    on a free loopback port. Its combined stdout/stderr is written to a log
    file in a temporary directory rather than to a pipe: nothing reads the
    pipe while tests run, so a talkative server could otherwise block once
    the pipe buffer fills up.

    The fixture skips (rather than fails) the dependent tests when ``uv`` is
    not available, when the process exits right after launch, or when the
    server does not become ready in time. The last 20 log lines are included
    in the skip message.

    Yields:
        The ``http://127.0.0.1:<port>`` URL of the running server.
    """
    if not _which("uv"):
        pytest.skip("uv executable not found on PATH")

    port = _find_free_port()
    base_url = f"http://127.0.0.1:{port}"
    log_path = tmp_path_factory.mktemp("sokoban") / "server.log"

    env = os.environ.copy()
    # Set the test API key.
    # NOTE(review): this value must stay identical to the bearer token the
    # test modules send in AUTH_HEADER. If it is a real credential it should
    # be rotated and replaced by a dummy value in both places.
    env["ENVIRONMENT_API_KEY"] = "sk_env_30c78a787bac223c716918181209f263"
    cmd = [
        "uv",
        "run",
        "-m",
        "synth_ai",
        "task-app",
        "serve",
        "sokoban",
        "--port",
        str(port),
        "--no-reload",
    ]

    def _log_tail() -> str:
        """Return the last 20 lines the server has logged so far."""
        try:
            text = log_path.read_text(errors="replace")
        except OSError:
            return ""
        return "\n".join(text.strip().splitlines()[-20:])

    with log_path.open("w") as log_file:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            env=env,
            stdin=subprocess.PIPE,
        )
        try:
            # Send "n" to decline tracing. Best effort: if the process is
            # already gone the write fails and the poll() below reports it.
            try:
                if proc.stdin:
                    proc.stdin.write("n\n")
                    proc.stdin.flush()
            except (OSError, ValueError):
                pass

            # Give the process a moment to fail fast (bad CLI args, import errors).
            time.sleep(2)
            if proc.poll() is not None:
                pytest.skip(f"Task app terminated immediately:\n{_log_tail()}")

            try:
                _wait_for_server(base_url)
            except RuntimeError as e:
                proc.terminate()
                try:
                    proc.wait(timeout=10)
                except TimeoutExpired:
                    proc.kill()
                    proc.wait()
                pytest.skip(f"Task app failed to start: {e}\n{_log_tail()}")

            yield base_url
        finally:
            if proc.poll() is None:
                proc.terminate()
                try:
                    proc.wait(timeout=5)
                except TimeoutExpired:
                    proc.kill()
                    # Reap the killed process so it does not linger as a zombie.
                    proc.wait()
            if proc.stdin:
                try:
                    proc.stdin.close()
                except OSError:
                    pass
|
|
113
|
+
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Integration tests for Sokoban task app with evaluation."""
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
requests = pytest.importorskip("requests")
|
|
5
|
+
|
|
6
|
+
# sokoban_server fixture is in conftest.py
|
|
7
|
+
# Bearer token: must equal the ENVIRONMENT_API_KEY that conftest.py sets for the server process (value taken from .env)
|
|
8
|
+
AUTH_HEADER = {"Authorization": "Bearer sk_env_30c78a787bac223c716918181209f263"}
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@pytest.mark.slow
def test_sokoban_server_health(sokoban_server: str) -> None:
    """The ``/health`` endpoint of the running task app must respond."""
    health_url = f"{sokoban_server}/health"
    resp = requests.get(health_url, timeout=5.0)
    # Both 200 and 400 count as "the server answered" here.
    acceptable = (200, 400)
    assert resp.status_code in acceptable, f"Unexpected status: {resp.status_code}"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_sokoban_task_info(sokoban_server: str) -> None:
    """``GET /task_info`` must return a JSON body describing the sokoban task."""
    response = requests.get(f"{sokoban_server}/task_info", timeout=5.0)
    assert response.status_code == 200

    body = response.json()
    assert "task" in body
    task_section = body["task"]
    assert task_section["id"] == "sokoban"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@pytest.mark.fast
def test_sokoban_manual_rollout(sokoban_server: str) -> None:
    """Run a rollout driven by a fixed action list and check the response shape.

    No LLM is involved: the actions are passed explicitly through
    ``policy.config`` with the ``noop`` provider.
    """
    # Action encoding: 0=left, 1=up, 2=right, 3=down
    scripted_actions = [0, 2, 2, 3]  # left, right, right, down
    environment_spec = {"seed": 0, "config": {"difficulty": "easy", "max_steps": 50}}
    policy_spec = {
        "policy_name": "manual",
        "config": {
            "provider": "noop",
            "actions": scripted_actions,
        },
    }
    rollout_payload = {
        "run_id": "test_manual",
        "env": environment_spec,
        "ops": [],
        "policy": policy_spec,
    }

    response = requests.post(
        f"{sokoban_server}/rollout",
        json=rollout_payload,
        headers=AUTH_HEADER,
        timeout=30.0,
    )
    assert response.status_code == 200

    body = response.json()
    assert "trajectories" in body
    assert len(body["trajectories"]) > 0
    assert "metrics" in body
|
|
57
|
+
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""Integration test for Sokoban rollouts via /rollout endpoint."""
|
|
2
|
+
import os
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
requests = pytest.importorskip("requests")
|
|
6
|
+
|
|
7
|
+
# Bearer token: must equal the ENVIRONMENT_API_KEY that conftest.py sets for the server process (value taken from .env)
|
|
8
|
+
AUTH_HEADER = {"Authorization": "Bearer sk_env_30c78a787bac223c716918181209f263"}
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@pytest.mark.slow
def test_sokoban_manual_rollout(sokoban_server: str) -> None:
    """Drive a rollout with six scripted moves and validate the trajectory."""
    # Action encoding: 0=left, 1=up, 2=right, 3=down
    scripted_actions = [0, 2, 2, 3, 3, 0]
    rollout_payload = {
        "run_id": "test_manual_sokoban",
        "env": {"seed": 0, "config": {"difficulty": "easy", "max_steps": 20}},
        # "ops" stays empty: manual actions travel through policy.config instead.
        "ops": [],
        "policy": {
            "policy_name": "manual",
            "config": {
                "provider": "noop",
                "actions": scripted_actions,
            },
        },
    }

    resp = requests.post(
        f"{sokoban_server}/rollout",
        json=rollout_payload,
        headers=AUTH_HEADER,
        timeout=30.0,
    )
    assert resp.status_code == 200, f"Rollout failed: {resp.status_code} {resp.text}"

    body = resp.json()

    # Top-level response structure.
    assert "trajectories" in body
    assert len(body["trajectories"]) > 0
    assert "metrics" in body

    first_trajectory = body["trajectories"][0]
    assert "steps" in first_trajectory
    recorded_steps = first_trajectory["steps"]

    # Six actions were requested, so at least six steps are expected.
    # NOTE(review): the earlier comment spoke of "initial obs + 6 actions",
    # which would imply 7 entries - confirm whether the initial observation
    # is counted as a step before tightening this bound.
    assert len(recorded_steps) >= 6

    # Every step needs an observation and a reward in one of two places.
    for entry in recorded_steps:
        assert "obs" in entry
        assert "reward" in entry or "reward_last" in entry.get("obs", {})
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@pytest.mark.slow
def test_sokoban_policy_rollout_with_openai(sokoban_server: str) -> None:
    """Run a short rollout whose actions come from the OpenAI gpt-5-mini policy.

    Skipped when ``OPENAI_API_KEY`` is not set. The test is deliberately
    lenient: a 500 from the server is tolerated, and the body is only
    inspected on success.
    """
    openai_key_present = "OPENAI_API_KEY" in os.environ
    if not openai_key_present:
        pytest.skip("OPENAI_API_KEY required for this test")

    model_name = "gpt-5-mini"
    rollout_payload = {
        "run_id": "test_policy_sokoban",
        "env": {"seed": 0, "config": {"difficulty": "easy", "max_steps": 10}},
        "ops": ["policy", "policy"],  # two policy invocations
        "policy": {
            "policy_name": model_name,
            "config": {
                "provider": "openai",
                "model": model_name,
                "max_tokens": 512,
            },
        },
    }

    resp = requests.post(
        f"{sokoban_server}/rollout",
        json=rollout_payload,
        headers=AUTH_HEADER,
        timeout=180.0,  # generous: the model call can be slow
    )

    # The model may or may not cope with Sokoban; only rule out other statuses.
    assert resp.status_code in (200, 500), f"Unexpected status: {resp.status_code}"

    if resp.status_code != 200:
        return
    body = resp.json()
    assert "trajectories" in body
    assert "metrics" in body
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@pytest.mark.fast
def test_sokoban_difficulty_levels(sokoban_server: str) -> None:
    """A short scripted rollout must succeed for every difficulty setting."""
    rollout_url = f"{sokoban_server}/rollout"
    for difficulty in ("easy", "medium", "hard"):
        env_spec = {"seed": 0, "config": {"difficulty": difficulty, "max_steps": 10}}
        policy_spec = {
            "config": {
                "provider": "noop",
                "actions": [2, 3, 0],  # right, down, left
            },
        }
        rollout_payload = {
            "run_id": f"test_difficulty_{difficulty}",
            "env": env_spec,
            "ops": [],
            "policy": policy_spec,
        }

        resp = requests.post(
            rollout_url,
            json=rollout_payload,
            headers=AUTH_HEADER,
            timeout=30.0,
        )
        assert resp.status_code == 200, f"Rollout failed for {difficulty}: {resp.text}"

        # Only the basic shape is checked per difficulty.
        body = resp.json()
        assert "trajectories" in body
        assert len(body["trajectories"]) > 0
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@pytest.mark.fast
def test_sokoban_max_steps_limit(sokoban_server: str) -> None:
    """The rollout must not record more steps than ``max_steps`` allows.

    Twenty actions are requested against a limit of five. At most
    ``max_steps + 1`` step entries are accepted (the extra one allows for an
    initial observation), and when that extra entry is present the final
    observation has to be flagged as truncated.
    """
    max_steps = 5
    requested_actions = [0] * 20  # far more than the limit permits
    rollout_payload = {
        "run_id": "test_max_steps",
        "env": {"seed": 0, "config": {"difficulty": "easy", "max_steps": max_steps}},
        "ops": [],
        "policy": {
            "config": {
                "provider": "noop",
                "actions": requested_actions,
            },
        },
    }

    resp = requests.post(
        f"{sokoban_server}/rollout",
        json=rollout_payload,
        headers=AUTH_HEADER,
        timeout=30.0,
    )
    assert resp.status_code == 200

    body = resp.json()
    steps = body["trajectories"][0]["steps"]

    upper_bound = max_steps + 1
    assert len(steps) <= upper_bound, f"Expected <= {max_steps + 1} steps, got {len(steps)}"

    # Truncation is only required when the step count exceeds the limit.
    last_observation = steps[-1].get("obs", {})
    exceeded_limit = len(steps) > max_steps
    if exceeded_limit:
        assert last_observation.get("truncated") is True
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@pytest.mark.fast
def test_sokoban_completion_detection(sokoban_server: str) -> None:
    """The final step must expose the fields used to detect a solved puzzle.

    The scripted moves are arbitrary, so the puzzle is not expected to be
    solved. The test checks that the termination and box-count fields are
    present, and that they are consistent if the puzzle does end up solved.
    """
    arbitrary_moves = [2, 3, 0, 1, 2]
    rollout_payload = {
        "run_id": "test_completion",
        "env": {"seed": 0, "config": {"difficulty": "easy", "max_steps": 50}},
        "ops": [],
        "policy": {
            "config": {
                "provider": "noop",
                "actions": arbitrary_moves,
            },
        },
    }

    resp = requests.post(
        f"{sokoban_server}/rollout",
        json=rollout_payload,
        headers=AUTH_HEADER,
        timeout=30.0,
    )
    assert resp.status_code == 200

    body = resp.json()
    last_step = body["trajectories"][0]["steps"][-1]
    last_obs = last_step.get("obs", {})

    # Termination information may live on the observation or on the step.
    assert "terminated" in last_obs or "done" in last_step
    for required_key in ("boxes_on_target", "num_boxes"):
        assert required_key in last_obs

    # All boxes on targets implies the episode must be marked finished.
    solved = last_obs.get("boxes_on_target") == last_obs.get("num_boxes")
    if solved:
        assert last_obs.get("terminated") is True or last_step.get("done") is True
|
|
198
|
+
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Unit tests for Sokoban environment and rewards."""
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@pytest.mark.fast
def test_sokoban_module_imports():
    """The Sokoban environment and engine modules import and expose their main classes."""
    from synth_ai.environments.examples.sokoban import environment, engine

    expected_exports = (
        (environment, "SokobanEnvironment"),
        (engine, "SokobanEngine"),
    )
    for module, class_name in expected_exports:
        assert hasattr(module, class_name)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@pytest.mark.asyncio
async def test_sokoban_reward_components():
    """Exercise the Sokoban reward components against hand-built public states.

    Checks that an engine built from a minimal task has a reward stack, that
    the goal component scores 0.0 on an unsolved state and a positive value
    on a solved one, and that the step penalty lies strictly between -1 and 0.
    """
    from synth_ai.environments.examples.sokoban.engine import (
        SokobanEngine,
        SokobanGoalAchievedComponent,
        SokobanStepPenaltyComponent,
        SokobanPublicState,
    )
    from synth_ai.environments.tasks.core import TaskInstance, Impetus, Intent

    # Smallest task instance the engine accepts.
    minimal_intent = Intent(
        rubric={"goal": "test"},
        gold_trajectories=None,
        gold_state_diff={},
        deterministic_eval_functions=[],
    )
    task = TaskInstance(
        id="test",
        impetus=Impetus(instructions="Test"),
        intent=minimal_intent,
        metadata={"difficulty": "easy", "max_steps": 50, "seed": 0},
        is_reproducible=False,
        initial_engine_snapshot=None,
    )

    engine = SokobanEngine(task)

    # The engine must carry a reward stack.
    assert hasattr(engine, "reward_stack")
    assert engine.reward_stack is not None

    # Stand-alone component instances, scored directly below.
    goal_reward = SokobanGoalAchievedComponent()
    penalty = SokobanStepPenaltyComponent()

    import numpy as np

    def _public_state(boxes_on_target, num_steps, last_action_name):
        """Build a one-box public state; only the three arguments vary."""
        return SokobanPublicState(
            dim_room=(3, 3),
            room_fixed=np.array([[0]]),
            room_state=np.array([[0]]),
            player_position=(0, 0),
            boxes_on_target=boxes_on_target,
            num_steps=num_steps,
            max_steps=50,
            last_action_name=last_action_name,
            num_boxes=1,
            error_info=None,
        )

    action = {"action": 0}

    # Unsolved puzzle: the goal component contributes nothing.
    state = _public_state(0, 0, "NONE")
    reward1 = await goal_reward.score(state, action)
    assert reward1 == 0.0

    # Solved puzzle (the single box is on its target): positive goal reward.
    state_complete = _public_state(1, 10, "RIGHT")
    reward_complete = await goal_reward.score(state_complete, {"action": 0})
    assert reward_complete > 0

    # Step penalty: negative but small.
    penalty_reward = await penalty.score(state, {"action": 0})
    assert penalty_reward < 0
    assert penalty_reward > -1
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def test_sokoban_difficulty_settings():
    """Check that a Sokoban engine can be built for each difficulty label.

    For "easy", "medium" and "hard", a task instance is created whose metadata
    carries that difficulty, and an engine is constructed from it.
    """
    from synth_ai.environments.examples.sokoban.engine import SokobanEngine
    from synth_ai.environments.tasks.core import TaskInstance, Impetus, Intent

    def build_task(level):
        # Identical task for every level apart from metadata["difficulty"].
        return TaskInstance(
            id="test",
            impetus=Impetus(instructions="Test"),
            intent=Intent(
                rubric={"goal": "test"},
                gold_trajectories=None,
                gold_state_diff={},
                deterministic_eval_functions=[],
            ),
            metadata={"difficulty": level, "max_steps": 50, "seed": 0},
            is_reproducible=False,
            initial_engine_snapshot=None,
        )

    # NOTE(review): indentation was lost in the source; the engine
    # construction and assertion are assumed to run once per difficulty.
    for level in ["easy", "medium", "hard"]:
        engine = SokobanEngine(build_task(level))
        assert engine is not None
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Verilog Eval Config for Groq Qwen3-32B
|
|
2
|
+
|
|
3
|
+
[task_app]
|
|
4
|
+
url = "http://localhost:8103" # Verilog task app port
|
|
5
|
+
|
|
6
|
+
[eval]
|
|
7
|
+
num_episodes = 5
|
|
8
|
+
seeds = [0, 1, 2, 3, 4]
|
|
9
|
+
max_steps = 10
|
|
10
|
+
|
|
11
|
+
[policy]
|
|
12
|
+
provider = "groq"
|
|
13
|
+
model = "qwen/qwen3-32b"
|
|
14
|
+
temperature = 0.2
|
|
15
|
+
max_tokens = 768
|
|
16
|
+
inference_url = "https://api.groq.com/openai/v1/chat/completions"
|
|
17
|
+
|
|
18
|
+
[env]
|
|
19
|
+
difficulty = "medium" # Can be "easy", "medium", or "hard"
|
|
20
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# GRPO Verilog Task App
|
|
2
|
+
|
|
3
|
+
This example mirrors the Crafter task app layout while targeting the Verilog
|
|
4
|
+
hardware synthesis environment under `synth_ai.environments.examples.verilog`.
|
|
5
|
+
The `grpo_verilog.py` module builds a lightweight dataset from the VerilogEval
|
|
6
|
+
spec-to-RTL benchmark and wires a minimalist task-app configuration. The
|
|
7
|
+
companion `grpo_verilog_task_app.py` acts as a compatibility wrapper for direct
|
|
8
|
+
FastAPI execution or Modal deployment.
|
|
9
|
+
|
|
10
|
+
The rollout bridge currently surfaces the initial observation for the selected
|
|
11
|
+
task instance, providing a scaffold for future extensions that integrate the
|
|
12
|
+
full hosted environment workflow and policy orchestration similar to Crafter.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|