synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +186 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
- examples/multi_step/crafter_rl_lora.md +51 -10
- examples/multi_step/sse_metrics_streaming_notes.md +357 -0
- examples/multi_step/task_app_config_notes.md +7 -1
- examples/swe/task_app/grpo_swe_mini.py +55 -26
- examples/swe/task_app/hosted/rollout.py +40 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/task_app/__init__.py +2 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +21 -46
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +67 -49
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +242 -193
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
- examples/task_apps/pokemon_red/task_app.py +606 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +4 -2
- examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +4 -2
- examples/warming_up_to_rl/run_eval.py +127 -18
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +41 -1
- synth_ai/api/train/builders.py +73 -29
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +44 -0
- synth_ai/api/train/configs/rl.py +134 -0
- synth_ai/api/train/configs/sft.py +95 -0
- synth_ai/api/train/configs/shared.py +24 -0
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +7 -51
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +49 -43
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/rl_demo.py +86 -106
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/task_apps.py +1710 -186
- synth_ai/demos/core/cli.py +121 -159
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
- synth_ai/environments/examples/crafter_classic/environment.py +16 -0
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +30 -4
- synth_ai/evals/__init__.py +15 -0
- synth_ai/evals/client.py +82 -0
- synth_ai/evals/types.py +42 -0
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +127 -0
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +14 -5
- synth_ai/task/contracts.py +124 -38
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +53 -0
- synth_ai/task/rubrics/loaders.py +133 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +113 -0
- synth_ai/task/rubrics/strict.py +149 -0
- synth_ai/task/server.py +8 -7
- synth_ai/task/validators.py +269 -6
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +130 -0
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/native_manager.py +3 -3
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +228 -89
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -1
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Compatibility wrapper for the GRPO Verilog task app.
|
|
2
|
+
|
|
3
|
+
This mirrors the Crafter task app wrapper while delegating configuration to
|
|
4
|
+
`grpo_verilog.py`. Normal usage should prefer `uvx synth-ai serve grpo-verilog`,
|
|
5
|
+
but the module remains for direct execution or importing the FastAPI app.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from fastapi.exceptions import RequestValidationError
|
|
14
|
+
from fastapi.responses import JSONResponse
|
|
15
|
+
from starlette.requests import Request
|
|
16
|
+
from synth_ai.task.apps import ModalDeploymentConfig, registry
|
|
17
|
+
from synth_ai.task.auth import is_api_key_header_authorized, normalize_environment_api_key
|
|
18
|
+
from synth_ai.task.server import TaskAppConfig, create_task_app, run_task_app
|
|
19
|
+
|
|
20
|
+
from .grpo_verilog import build_config
|
|
21
|
+
|
|
22
|
+
APP_ID = "grpo-verilog"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _build_base_config() -> TaskAppConfig:
    """Build the base ``TaskAppConfig`` by delegating to ``build_config``.

    The config is constructed on each call rather than at import time, so
    importing this module does not trigger the (potentially heavy) build.
    """
    base_config = build_config()
    return base_config
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Expose the registry entry's Modal deployment settings and env files as
# module-level constants, with empty fallbacks when the lookup fails.
try:
    _REGISTERED_ENTRY = registry.get(APP_ID)
except Exception:  # pragma: no cover - registry unavailable in some contexts
    # Lookup failed: no Modal deployment config and no env files.
    MODAL_DEPLOYMENT: ModalDeploymentConfig | None = None
    ENV_FILES: tuple[str, ...] = ()
else:
    MODAL_DEPLOYMENT, ENV_FILES = (
        _REGISTERED_ENTRY.modal,
        tuple(_REGISTERED_ENTRY.env_files),
    )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def build_task_app_config() -> TaskAppConfig:
    """Build the base config and return a clone of it.

    Each call constructs a new base config via ``_build_base_config()`` and
    hands back the result of its ``clone()`` method.
    """
    return _build_base_config().clone()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def fastapi_app():
    """Create the task app and install wrapper-specific health and 422 handlers.

    The app is built from ``build_task_app_config()``. Existing GET routes at
    ``/health`` and ``/health/rollout`` are dropped and replaced by handlers
    that still answer 200 when the API-key header check fails (reporting
    ``authorized: False``). A ``RequestValidationError`` handler is registered
    that prints a short request snapshot before returning a 422 response.

    Returns:
        The configured application object.
    """
    app = create_task_app(build_task_app_config())

    # Drop the default health endpoints so the soft-auth versions below take over.
    replaced_paths = {"/health", "/health/rollout"}

    def _is_replaced_health_route(route) -> bool:
        # Routes without ``path``/``methods`` attributes never match.
        route_path = getattr(route, "path", None)
        route_methods = getattr(route, "methods", set()) or set()
        return route_path in replaced_paths and "GET" in route_methods

    app.router.routes = [
        route for route in app.router.routes if not _is_replaced_health_route(route)
    ]

    def _log_env_key_prefix(source: str, env_key: str | None) -> str | None:
        # Print and return the first half (at least one character) of the key.
        if not env_key:
            return None
        half_length = max(1, len(env_key) // 2)
        prefix = env_key[:half_length]
        print(f"[{source}] expected ENVIRONMENT_API_KEY prefix: {prefix}")
        return prefix

    def _health_response(request: Request, source: str, authorized_payload: dict):
        # Shared body of both health endpoints; only the log label and the
        # payload returned to authorized callers differ.
        env_key = normalize_environment_api_key()
        if not env_key:
            return JSONResponse(
                status_code=503,
                content={"status": "unhealthy", "detail": "Missing ENVIRONMENT_API_KEY"},
            )
        if is_api_key_header_authorized(request):
            return authorized_payload
        # NOTE(review): this returns half of the expected key to a caller that
        # failed the header check - confirm this disclosure is intended.
        content = {"status": "healthy", "authorized": False}
        prefix = _log_env_key_prefix(source, env_key)
        if prefix:
            content["expected_api_key_prefix"] = prefix
        return JSONResponse(status_code=200, content=content)

    @app.get("/health")
    async def health(request: Request):
        return _health_response(
            request, "health", {"status": "healthy", "authorized": True}
        )

    @app.get("/health/rollout")
    async def health_rollout(request: Request):
        return _health_response(
            request, "health/rollout", {"ok": True, "authorized": True}
        )

    @app.exception_handler(RequestValidationError)
    async def _on_validation_error(request: Request, exc: RequestValidationError):
        # Logging is best-effort: a failure while building the snapshot must
        # not prevent the 422 response from being sent.
        try:
            headers = request.headers
            snapshot = {"path": str(request.url.path)}
            for flag, header_name in (
                ("have_x_api_key", "x-api-key"),
                ("have_x_api_keys", "x-api-keys"),
                ("have_authorization", "authorization"),
            ):
                snapshot[flag] = bool(headers.get(header_name))
            snapshot["errors"] = exc.errors()[:5]
            print("[422] validation", snapshot, flush=True)
        except Exception:
            pass
        return JSONResponse(
            status_code=422,
            content={"status": "invalid", "detail": exc.errors()[:5]},
        )

    return app
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
if __name__ == "__main__":
    # Command-line entry point for running the task app locally.
    cli = argparse.ArgumentParser(description="Run the Verilog task app locally")
    cli.add_argument("--host", default="0.0.0.0")
    cli.add_argument("--port", type=int, default=8103)
    cli.add_argument("--reload", action="store_true", help="Enable uvicorn autoreload")
    cli.add_argument(
        "--env-file",
        action="append",
        default=[],
        help="Additional .env files to load before startup",
    )
    options = cli.parse_args()

    # Start with backend/.env.dev (four directories above this file's folder)
    # when it exists, then append any files given via --env-file.
    dev_env_path = Path(__file__).resolve().parents[4] / "backend" / ".env.dev"
    env_files = []
    if dev_env_path.exists():
        env_files.append(str(dev_env_path))
    env_files += options.env_file or []

    run_task_app(
        build_task_app_config,
        host=options.host,
        port=options.port,
        reload=options.reload,
        env_files=env_files,
    )
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Shared fixtures for Verilog tests."""
|
|
2
|
+
import os
|
|
3
|
+
import socket
|
|
4
|
+
import subprocess
|
|
5
|
+
from subprocess import TimeoutExpired
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Iterator
|
|
9
|
+
|
|
10
|
+
import pytest
|
|
11
|
+
|
|
12
|
+
requests = pytest.importorskip("requests")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _which(executable: str) -> bool:
|
|
16
|
+
return any(
|
|
17
|
+
(Path(path) / executable).exists()
|
|
18
|
+
for path in os.getenv("PATH", "").split(os.pathsep)
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _find_free_port() -> int:
|
|
23
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
|
24
|
+
sock.bind(("127.0.0.1", 0))
|
|
25
|
+
return sock.getsockname()[1]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _wait_for_server(base_url: str, timeout: float = 60.0) -> None:
    """Poll ``{base_url}/info`` until the Verilog server responds.

    A 200 response means the server is ready. A 400 or 401 also counts: the
    server is up but wants authentication, which is enough for startup.

    Args:
        base_url: Root URL of the task app, without a trailing slash.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If no accepted response arrives before the deadline.
    """
    ready_statuses = (200, 400, 401)
    # Monotonic clock: the deadline is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            resp = requests.get(f"{base_url}/info", timeout=2.0)
        except requests.RequestException:
            # Server is not accepting connections yet; retry after the pause.
            pass
        else:
            if resp.status_code in ready_statuses:
                return
        # Pause on every unsuccessful attempt. Previously only exceptions
        # slept, so any other HTTP status caused a tight busy loop.
        time.sleep(0.5)
    raise RuntimeError(f"Task app at {base_url} did not become ready")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _read_log_tail(log_path: Path, max_lines: int = 20) -> str:
    """Return the last *max_lines* lines of the server log ("" if unreadable)."""
    try:
        text = log_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""
    return "\n".join(text.strip().splitlines()[-max_lines:])


def _stop_process(proc: "subprocess.Popen[str]", timeout: float = 5.0) -> None:
    """Terminate *proc* if still running, escalate to kill, and reap it."""
    if proc.poll() is None:
        proc.terminate()
        try:
            proc.wait(timeout=timeout)
        except TimeoutExpired:
            proc.kill()
            proc.wait()  # reap the child so it does not linger as a zombie
    if proc.stdin:
        try:
            proc.stdin.close()
        except OSError:
            pass


@pytest.fixture(scope="module")
def verilog_server(tmp_path_factory: pytest.TempPathFactory) -> Iterator[str]:
    """Start the Verilog task app server for testing and yield its base URL.

    The server is launched with ``uv run -m synth_ai task-app serve
    grpo-verilog`` on a free local port. Its combined stdout/stderr goes to a
    log file in a temporary directory rather than a pipe: nothing reads the
    pipe while tests run, so a chatty server could fill it and block.

    The fixture skips (rather than fails) when ``uv`` is not on PATH, when
    ``GROQ_API_KEY`` is unset, when the process exits right after launch, or
    when the server does not become ready in time.

    Yields:
        The ``http://127.0.0.1:<port>`` base URL of the running server.
    """
    if not _which("uv"):
        pytest.skip("uv executable not found on PATH")
    if "GROQ_API_KEY" not in os.environ:
        pytest.skip("GROQ_API_KEY must be set for Groq-backed tests")

    port = _find_free_port()
    base_url = f"http://127.0.0.1:{port}"
    log_path = tmp_path_factory.mktemp("verilog") / "task_app.log"

    cmd = [
        "uv",
        "run",
        "-m",
        "synth_ai",
        "task-app",
        "serve",
        "grpo-verilog",
        "--port",
        str(port),
        "--no-reload",
    ]
    with log_path.open("w", encoding="utf-8") as log_file:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            env=os.environ.copy(),
            stdin=subprocess.PIPE,
        )
        try:
            # Send "n" to decline tracing. Best-effort: the process may
            # already have exited, which the poll() check below reports.
            try:
                if proc.stdin:
                    proc.stdin.write("n\n")
                    proc.stdin.flush()
            except (OSError, ValueError):
                pass

            # Give the process a moment, then detect an immediate crash.
            time.sleep(2)
            if proc.poll() is not None:
                pytest.skip(
                    f"Task app terminated immediately:\n{_read_log_tail(log_path)}"
                )

            try:
                _wait_for_server(base_url)
            except RuntimeError as e:
                _stop_process(proc, timeout=10)
                pytest.skip(
                    f"Task app failed to start: {e}\n{_read_log_tail(log_path)}"
                )

            yield base_url
        finally:
            _stop_process(proc)
|
|
115
|
+
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Integration tests for Verilog task app with Groq evaluation."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import socket
|
|
6
|
+
import subprocess
|
|
7
|
+
from subprocess import TimeoutExpired
|
|
8
|
+
import time
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Iterator
|
|
11
|
+
|
|
12
|
+
import pytest
|
|
13
|
+
|
|
14
|
+
requests = pytest.importorskip("requests")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Directory that contains this test module.
HERE = Path(__file__).resolve().parent
# Two directory levels above HERE; the eval config is looked up there.
TASK_APP_ROOT = HERE.parents[1]
CONFIG_PATH = TASK_APP_ROOT.joinpath("eval_groq_qwen32b.toml")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _which(executable: str) -> bool:
|
|
23
|
+
return any(
|
|
24
|
+
(Path(path) / executable).exists()
|
|
25
|
+
for path in os.getenv("PATH", "").split(os.pathsep)
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _find_free_port() -> int:
|
|
30
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
|
31
|
+
sock.bind(("127.0.0.1", 0))
|
|
32
|
+
return sock.getsockname()[1]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _wait_for_server(base_url: str, timeout: float = 60.0) -> None:
    """Poll ``{base_url}/info`` until the Verilog server answers with 200.

    Args:
        base_url: Root URL of the task app, without a trailing slash.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If no 200 response arrives before the deadline.
    """
    # Monotonic clock: the deadline is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            resp = requests.get(f"{base_url}/info", timeout=2.0)
        except requests.RequestException:
            # Server is not accepting connections yet; retry after the pause.
            pass
        else:
            if resp.status_code == 200:
                return
        # Pause on every unsuccessful attempt. Previously only exceptions
        # slept, so a non-200 response caused a tight busy loop.
        time.sleep(0.5)
    raise RuntimeError(f"Task app at {base_url} did not become ready")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _read_log_tail(log_path: Path, max_lines: int = 20) -> str:
    """Return the last *max_lines* lines of the server log ("" if unreadable)."""
    try:
        text = log_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""
    return "\n".join(text.strip().splitlines()[-max_lines:])


def _stop_process(proc: subprocess.Popen[str], timeout: float = 5.0) -> None:
    """Terminate *proc* if still running, escalate to kill, and reap it."""
    if proc.poll() is None:
        proc.terminate()
        try:
            proc.wait(timeout=timeout)
        except TimeoutExpired:
            proc.kill()
            proc.wait()  # reap the child so it does not linger as a zombie
    if proc.stdin:
        try:
            proc.stdin.close()
        except OSError:
            pass


@pytest.fixture
def verilog_server(tmp_path: Path) -> Iterator[str]:
    """Start the Verilog task app server for testing and yield its base URL.

    The server is launched with ``uv run -m synth_ai task-app serve
    grpo-verilog`` on a free local port. Its combined stdout/stderr goes to a
    log file under ``tmp_path`` rather than a pipe: nothing reads the pipe
    while tests run, so a chatty server could fill it and block.

    The fixture skips (rather than fails) when ``uv`` is not on PATH, when
    ``GROQ_API_KEY`` is unset, when the process exits right after launch, or
    when the server does not become ready in time.

    Yields:
        The ``http://127.0.0.1:<port>`` base URL of the running server.
    """
    if not _which("uv"):
        pytest.skip("uv executable not found on PATH")
    if "GROQ_API_KEY" not in os.environ:
        pytest.skip("GROQ_API_KEY must be set for Groq-backed evals")

    port = _find_free_port()
    base_url = f"http://127.0.0.1:{port}"
    log_path = tmp_path / "task_app.log"

    cmd = [
        "uv",
        "run",
        "-m",
        "synth_ai",
        "task-app",
        "serve",
        "grpo-verilog",
        "--port",
        str(port),
        "--no-reload",
    ]
    with log_path.open("w", encoding="utf-8") as log_file:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            env=os.environ.copy(),
            stdin=subprocess.PIPE,  # Auto-answer tracing prompt
        )
        try:
            # Send "n" to decline tracing. Best-effort: the process may
            # already have exited, which the poll() check below reports.
            try:
                if proc.stdin:
                    proc.stdin.write("n\n")
                    proc.stdin.flush()
            except (OSError, ValueError):
                pass

            # Check if process died immediately
            time.sleep(2)
            if proc.poll() is not None:
                pytest.skip(
                    f"Task app terminated immediately:\n{_read_log_tail(log_path)}"
                )

            try:
                _wait_for_server(base_url)
            except RuntimeError as e:
                _stop_process(proc, timeout=10)
                pytest.skip(
                    f"Task app failed to start: {e}\n{_read_log_tail(log_path)}"
                )

            yield base_url
        finally:
            _stop_process(proc)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@pytest.mark.slow
def test_verilog_server_health(verilog_server: str) -> None:
    """GET ``/health`` must answer with either 200 or 400."""
    # The request carries no credentials, so a 400 (auth failed) is accepted
    # alongside a plain 200.
    acceptable_statuses = (200, 400)
    health_url = f"{verilog_server}/health"
    resp = requests.get(health_url, timeout=5.0)
    assert resp.status_code in acceptable_statuses, f"Unexpected status: {resp.status_code}"
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@pytest.mark.slow
def test_verilog_task_info(verilog_server: str) -> None:
    """GET ``/task_info`` must return 200 and a payload whose task id is "verilog"."""
    response = requests.get(f"{verilog_server}/task_info", timeout=5.0)
    assert response.status_code == 200
    payload = response.json()
    assert "task" in payload
    task_section = payload["task"]
    assert task_section["id"] == "verilog"
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@pytest.mark.slow
def test_verilog_eval_with_groq(verilog_server: str) -> None:
    """Run a one-seed Groq-backed eval against the fixture's Verilog task app.

    Invokes ``uv run -m synth_ai eval grpo-verilog`` with ``CONFIG_PATH`` and
    the fixture URL, then checks the captured output for completion and
    score markers. Skips when the config file is missing. A timeout or a
    non-zero exit fails the test with the captured output attached.
    """
    if not CONFIG_PATH.exists():
        pytest.skip(f"Config file not found: {CONFIG_PATH}")

    cmd = [
        "uv",
        "run",
        "-m",
        "synth_ai",
        "eval",
        "grpo-verilog",
        "--config",
        str(CONFIG_PATH),
        "--url",
        verilog_server,
        "--model",
        "qwen/qwen3-32b",
        "--seeds",
        "0",  # Just test one seed
    ]
    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            env=os.environ.copy(),
            check=False,
            timeout=300,  # 5 minutes max
        )
    except TimeoutExpired as exc:
        # Report the partial output instead of a bare traceback. The captured
        # output on the exception may be bytes even in text mode.
        partial = exc.stdout or ""
        if isinstance(partial, bytes):
            partial = partial.decode("utf-8", errors="replace")
        pytest.fail(f"Eval timed out after {exc.timeout} seconds:\n{partial}")

    output = result.stdout
    if result.returncode != 0:
        pytest.fail(f"Eval failed with return code {result.returncode}:\n{output}")

    # Check for success indicators
    assert "Eval complete" in output, f"Missing 'Eval complete' marker:\n{output}"
    assert (
        "1 ok, 0 failed" in output or "status=200" in output
    ), f"Missing success summary:\n{output}"

    # Check that we got a meaningful outcome score
    lowered = output.lower()
    assert (
        "outcome" in lowered or "mean_return" in lowered
    ), f"Missing outcome score in output:\n{output}"
|
|
179
|
+
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Integration test for Verilog rollouts via /rollout endpoint."""
|
|
2
|
+
import os
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
requests = pytest.importorskip("requests")
|
|
6
|
+
|
|
7
|
+
# Read the task-app key from the environment instead of committing it.
# SECURITY: a real-looking `sk_env_...` token used to be hard-coded here and
# was shipped in a published wheel; that key should be rotated.
# Export ENVIRONMENT_API_KEY (the same value the task app loads from .env)
# before running these tests; when it is unset the header carries an empty
# token and authenticated endpoints are expected to reject the request.
AUTH_HEADER = {"Authorization": f"Bearer {os.environ.get('ENVIRONMENT_API_KEY', '')}"}
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@pytest.mark.slow
def test_verilog_policy_rollout(verilog_server: str) -> None:
    """POST a policy-driven rollout to /rollout and validate the response shape.

    Skipped when GROQ_API_KEY is not set in the environment.
    """
    if "GROQ_API_KEY" not in os.environ:
        pytest.skip("GROQ_API_KEY required for this test")

    policy_spec = {
        "policy_name": "qwen-groq",
        "config": {
            "provider": "groq",
            "model": "qwen/qwen3-32b",
            "max_steps": 5,  # Limit steps for test
        },
    }
    request_body = {
        "run_id": "test_policy_verilog",
        "env": {"seed": 0},
        "ops": [],  # Empty ops means use policy for all steps
        "policy": policy_spec,
    }

    response = requests.post(
        f"{verilog_server}/rollout",
        json=request_body,
        headers=AUTH_HEADER,
        timeout=120.0,
    )
    assert response.status_code == 200, f"Rollout failed: {response.status_code} {response.text}"

    body = response.json()

    # Top-level sections the rollout response must contain
    for section in ("trajectories", "metrics", "trace"):
        assert section in body

    first_trajectory = body["trajectories"][0]
    assert "steps" in first_trajectory

    # The policy must have taken at least one step
    assert len(first_trajectory["steps"]) > 0

    # Some return figure must be reported
    reported_metrics = body["metrics"]
    assert "episode_returns" in reported_metrics or "mean_return" in reported_metrics
|
|
55
|
+
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Unit tests for Verilog scoring and rewards."""
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
from synth_ai.environments.examples.verilog.engine import (
|
|
5
|
+
VerilogCompileSuccessComponent,
|
|
6
|
+
VerilogSimulationPassComponent,
|
|
7
|
+
VerilogSubmitSuccessComponent,
|
|
8
|
+
VerilogPublicState,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@pytest.mark.asyncio
async def test_compile_success_reward():
    """Compile scoring: 0.1 for returncode 0, 0.0 for a failed compile or any other action."""
    scorer = VerilogCompileSuccessComponent()
    public_state = VerilogPublicState(files={}, build_dir="/tmp", task_completed=False)

    # (action, expected reward), scored in the listed order
    cases = (
        ({"type": "compile", "returncode": 0}, 0.1),  # successful compile
        ({"type": "compile", "returncode": 1}, 0.0),  # failed compile
        ({"type": "write_file"}, 0.0),  # non-compile action
    )
    for action, expected in cases:
        assert await scorer.score(public_state, action) == expected
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@pytest.mark.asyncio
async def test_simulation_pass_reward():
    """Simulation scoring: 1.0 for a passing simulate, 0.0 for a failing one or any other action."""
    scorer = VerilogSimulationPassComponent()
    public_state = VerilogPublicState(files={}, build_dir="/tmp", task_completed=False)

    # (action, expected reward), scored in the listed order
    cases = (
        ({"type": "simulate", "passed": True}, 1.0),  # passing simulation
        ({"type": "simulate", "passed": False}, 0.0),  # failing simulation
        ({"type": "compile"}, 0.0),  # non-simulate action
    )
    for action, expected in cases:
        assert await scorer.score(public_state, action) == expected
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@pytest.mark.asyncio
async def test_submit_success_reward():
    """Submit scoring: 10.0 for a passing submit, 0.0 for a failing one or any other action."""
    scorer = VerilogSubmitSuccessComponent()
    public_state = VerilogPublicState(files={}, build_dir="/tmp", task_completed=False)

    # (action, expected reward), scored in the listed order
    cases = (
        ({"type": "submit", "passed": True}, 10.0),  # tests passed
        ({"type": "submit", "passed": False}, 0.0),  # tests didn't pass
        ({"type": "compile"}, 0.0),  # non-submit action
    )
    for action, expected in cases:
        assert await scorer.score(public_state, action) == expected
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@pytest.mark.asyncio
async def test_submit_checks_simulation_output():
    """Verify that submit() derives its verdict from the last recorded simulation output."""
    from synth_ai.environments.examples.verilog.engine import VerilogEngine
    from synth_ai.environments.tasks.core import TaskInstance, Impetus, Intent

    # Smallest task instance the engine constructor accepts in this test
    minimal_intent = Intent(
        rubric={"goal": "test"},
        gold_trajectories=None,
        gold_state_diff={},
        deterministic_eval_functions=[],
    )
    instance = TaskInstance(
        id="test",
        impetus=Impetus(instructions="Test"),
        intent=minimal_intent,
        metadata=None,
        is_reproducible=False,
        initial_engine_snapshot=None,
    )
    instance.snapshot_dir = None  # Will be set by engine

    verilog_engine = VerilogEngine(instance)

    # Case 1: submitting before any simulation has been run
    outcome_initial = await verilog_engine.submit()
    assert outcome_initial["passed"] is False
    assert "No simulation run yet" in outcome_initial["detail"]

    # Case 2: last simulation output reports success
    verilog_engine._last_simulate_output = "Mismatches: 0 in 100 samples\nALL_TESTS_PASSED"
    outcome_pass = await verilog_engine.submit()
    assert outcome_pass["passed"] is True
    assert "All tests passed" in outcome_pass["detail"]

    # Case 3: last simulation output reports mismatches
    verilog_engine._last_simulate_output = "Mismatches: 5 in 100 samples\nErrors detected"
    outcome_fail = await verilog_engine.submit()
    assert outcome_fail["passed"] is False
    assert "Tests failed" in outcome_fail["detail"]
|
|
118
|
+
|
|
@@ -28,10 +28,10 @@ from pathlib import Path
|
|
|
28
28
|
from typing import Any
|
|
29
29
|
from uuid import uuid4
|
|
30
30
|
|
|
31
|
-
from examples.
|
|
32
|
-
|
|
31
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.envs.crafter.environment import (
|
|
32
|
+
CrafterEnvironment,
|
|
33
33
|
)
|
|
34
|
-
from examples.
|
|
34
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.envs.crafter.policy import CrafterPolicy
|
|
35
35
|
from openai import OpenAI
|
|
36
36
|
from synth_ai.environments.examples.crafter_classic.environment import CrafterClassicEnvironment
|
|
37
37
|
from synth_ai.environments.examples.crafter_classic.taskset import (
|
|
@@ -140,7 +140,7 @@ async def _run_episode(
|
|
|
140
140
|
) -> EpisodeResult:
|
|
141
141
|
task_instance = _build_task_instance(seed)
|
|
142
142
|
env = CrafterClassicEnvironment(task_instance)
|
|
143
|
-
wrapper =
|
|
143
|
+
wrapper = CrafterEnvironment(env, seed=seed)
|
|
144
144
|
policy = CrafterPolicy(inference_url="openai://chat-completions", model=model)
|
|
145
145
|
await policy.initialize({"use_tools": True, "model": model})
|
|
146
146
|
|
|
@@ -24,10 +24,10 @@ from pathlib import Path
|
|
|
24
24
|
from typing import Any
|
|
25
25
|
from uuid import uuid4
|
|
26
26
|
|
|
27
|
-
from examples.
|
|
28
|
-
|
|
27
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.envs.crafter.environment import (
|
|
28
|
+
CrafterEnvironment,
|
|
29
29
|
)
|
|
30
|
-
from examples.
|
|
30
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.envs.crafter.policy import CrafterPolicy
|
|
31
31
|
from openai import AsyncOpenAI
|
|
32
32
|
from synth_ai.environments.examples.crafter_classic.environment import CrafterClassicEnvironment
|
|
33
33
|
from synth_ai.environments.examples.crafter_classic.taskset import (
|
|
@@ -142,7 +142,7 @@ async def _run_episode(
|
|
|
142
142
|
async with semaphore:
|
|
143
143
|
task_instance = _build_task_instance(seed)
|
|
144
144
|
env = CrafterClassicEnvironment(task_instance)
|
|
145
|
-
wrapper =
|
|
145
|
+
wrapper = CrafterEnvironment(env, seed=seed)
|
|
146
146
|
|
|
147
147
|
policy = CrafterPolicy(inference_url="openai://chat-completions", model=model)
|
|
148
148
|
await policy.initialize({"use_tools": True, "model": model})
|
|
@@ -4,9 +4,11 @@
|
|
|
4
4
|
# task_app_url = "https://YOUR-TASK-APP.modal.run"
|
|
5
5
|
|
|
6
6
|
model = "qwen/qwen3-32b"
|
|
7
|
-
|
|
7
|
+
# Route inference to local task app Groq proxy
|
|
8
|
+
inference_url = "http://localhost:8001/proxy/groq"
|
|
9
|
+
num_episodes = 10
|
|
8
10
|
max_turns = 10
|
|
9
|
-
concurrency =
|
|
11
|
+
concurrency = 10
|
|
10
12
|
# difficulty = "easy" # optional
|
|
11
13
|
|
|
12
14
|
[rollout]
|