synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -1
- examples/swe/task_app/grpo_swe_mini.py +55 -26
- examples/swe/task_app/hosted/rollout.py +40 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/task_app/__init__.py +2 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +18 -13
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +25 -3
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +10 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
- examples/task_apps/pokemon_red/task_app.py +606 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +2 -2
- synth_ai/api/train/builders.py +25 -11
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +10 -10
- synth_ai/api/train/configs/rl.py +5 -4
- synth_ai/api/train/configs/sft.py +4 -3
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +7 -51
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +14 -7
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/rl_demo.py +8 -7
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/task_apps.py +1707 -186
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +30 -4
- synth_ai/evals/client.py +58 -61
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +16 -16
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +14 -5
- synth_ai/task/contracts.py +124 -38
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +53 -0
- synth_ai/task/rubrics/loaders.py +133 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +113 -0
- synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
- synth_ai/task/server.py +8 -7
- synth_ai/task/validators.py +269 -6
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +5 -5
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/native_manager.py +3 -3
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +214 -101
- examples/agora_ex/README_MoE.md +0 -224
- examples/agora_ex/__init__.py +0 -7
- examples/agora_ex/agora_ex.py +0 -65
- examples/agora_ex/agora_ex_task_app.py +0 -590
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
- examples/agora_ex/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/system_prompt_CURRENT.md +0 -63
- examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
- examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
- synth_ai/rubrics/__init__.py +0 -22
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
|
@@ -12,7 +12,7 @@ variety = "gspo"
|
|
|
12
12
|
# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
|
|
13
13
|
task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
|
|
14
14
|
# Point at the Synth backend (or compatible service) that exposes /api/judge/v1/*
|
|
15
|
-
judge_url = "https://
|
|
15
|
+
judge_url = "https://synth-backend-dev-docker.onrender.com/api"
|
|
16
16
|
|
|
17
17
|
[compute]
|
|
18
18
|
gpu_type = "H200"
|
|
@@ -101,6 +101,9 @@ verify_every_k = 0
|
|
|
101
101
|
|
|
102
102
|
[rubric]
|
|
103
103
|
enabled = true
|
|
104
|
+
model = "openai/gpt-oss-120b"
|
|
105
|
+
api_base = "https://synth-backend-dev-docker.onrender.com/api/judge"
|
|
106
|
+
api_key_env = "OPENAI_API_KEY"
|
|
104
107
|
# Blend the hosted judge scores with environment returns inside the trainer.
|
|
105
108
|
[rubric.weights]
|
|
106
109
|
env = 0.2
|
|
@@ -110,10 +113,18 @@ outcome = 0.4
|
|
|
110
113
|
[rubric.event]
|
|
111
114
|
# Hosted judge rubric for per-decision progress scoring.
|
|
112
115
|
rubric_id = "crafter/event@v1"
|
|
116
|
+
criteria = [
|
|
117
|
+
{ key = "progress.unique_achievements", weight = 0.9, description = "Return 1 when this decision explicitly unlocks a brand-new Crafter achievement (inventory or status text confirms it this turn). Otherwise return 0.", aggregation = "weighted_sum" },
|
|
118
|
+
{ key = "process.intent_alignment", weight = 0.1, description = "Use at most 0.3 to acknowledge tightly coupled setup that finishes the last prerequisite; keep ≤0.1 when the agent only repositions or gathers without an imminent unlock.", aggregation = "weighted_sum" },
|
|
119
|
+
]
|
|
113
120
|
|
|
114
121
|
[rubric.outcome]
|
|
115
122
|
# Hosted judge rubric for final trajectory scoring.
|
|
116
123
|
rubric_id = "crafter/outcome@v1"
|
|
124
|
+
criteria = [
|
|
125
|
+
{ key = "outcome.goal_completion", weight = 0.6, description = "Full credit when the agent ends with strong survival metrics and a clear crafted milestone (e.g., iron tools, furnace).", aggregation = "weighted_sum" },
|
|
126
|
+
{ key = "outcome.achievement_depth", weight = 0.4, description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success.", aggregation = "weighted_sum" },
|
|
127
|
+
]
|
|
117
128
|
|
|
118
129
|
[judge]
|
|
119
130
|
type = "gemini" # or "groq" when routing to Groq-hosted judges
|
|
@@ -60,34 +60,55 @@ try:
|
|
|
60
60
|
HAS_HOSTED = True
|
|
61
61
|
except Exception:
|
|
62
62
|
try: # pragma: no cover - optional dependency path
|
|
63
|
-
from examples.
|
|
64
|
-
|
|
63
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.branching import ( # type: ignore
|
|
64
|
+
BranchingEnvironmentConfig,
|
|
65
65
|
)
|
|
66
|
-
from examples.
|
|
67
|
-
|
|
66
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.environment_routes import ( # type: ignore # noqa: E501
|
|
67
|
+
CrafterEnvironmentRoutes,
|
|
68
68
|
)
|
|
69
|
-
from examples.
|
|
70
|
-
|
|
69
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.policy_routes import ( # type: ignore
|
|
70
|
+
PolicyRoutes,
|
|
71
71
|
)
|
|
72
|
-
from examples.
|
|
72
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import ( # type: ignore
|
|
73
|
+
RolloutPayload,
|
|
74
|
+
)
|
|
75
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import (
|
|
76
|
+
EnvironmentConfig,
|
|
77
|
+
)
|
|
78
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import (
|
|
79
|
+
PolicyConfig,
|
|
80
|
+
)
|
|
81
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import (
|
|
82
|
+
RolloutRequest,
|
|
83
|
+
)
|
|
84
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import (
|
|
85
|
+
RolloutResponse,
|
|
86
|
+
)
|
|
87
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import (
|
|
88
|
+
RunSpec,
|
|
89
|
+
)
|
|
90
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import (
|
|
91
|
+
ToolUse,
|
|
92
|
+
)
|
|
93
|
+
from examples.task_apps.crafter.task_app.hosted.rollout import ( # type: ignore
|
|
73
94
|
RolloutEnvSpec as LegacyRolloutEnvSpec,
|
|
74
95
|
)
|
|
75
|
-
from examples.
|
|
96
|
+
from examples.task_apps.crafter.task_app.hosted.rollout import (
|
|
76
97
|
RolloutPolicySpec as LegacyRolloutPolicySpec,
|
|
77
98
|
)
|
|
78
|
-
from examples.
|
|
99
|
+
from examples.task_apps.crafter.task_app.hosted.rollout import (
|
|
79
100
|
RolloutRecordConfig as LegacyRolloutRecordConfig,
|
|
80
101
|
)
|
|
81
|
-
from examples.
|
|
102
|
+
from examples.task_apps.crafter.task_app.hosted.rollout import (
|
|
82
103
|
RolloutRequest as LegacyRolloutRequest,
|
|
83
104
|
)
|
|
84
|
-
from examples.
|
|
105
|
+
from examples.task_apps.crafter.task_app.hosted.rollout import (
|
|
85
106
|
RolloutResponse as LegacyRolloutResponse,
|
|
86
107
|
)
|
|
87
|
-
from examples.
|
|
108
|
+
from examples.task_apps.crafter.task_app.hosted.rollout import (
|
|
88
109
|
RolloutSafetyConfig as LegacyRolloutSafetyConfig,
|
|
89
110
|
)
|
|
90
|
-
from examples.
|
|
111
|
+
from examples.task_apps.crafter.task_app.hosted.rollout import (
|
|
91
112
|
execute_rollout as legacy_execute_rollout,
|
|
92
113
|
)
|
|
93
114
|
HAS_HOSTED = True
|
|
@@ -264,7 +285,7 @@ def build_dataset() -> tuple[TaskDatasetRegistry, MiniSweDataset]:
|
|
|
264
285
|
def _base_task_info(dataset: MiniSweDataset) -> TaskInfo:
|
|
265
286
|
return TaskInfo(
|
|
266
287
|
task={"id": "swe_mini", "name": "mini-SWE Tasks", "version": "0.1.0"},
|
|
267
|
-
|
|
288
|
+
environment="swe-mini",
|
|
268
289
|
action_space={
|
|
269
290
|
"type": "tool",
|
|
270
291
|
"tools": ["run_command", "submit_patch"],
|
|
@@ -292,11 +313,6 @@ def _base_task_info(dataset: MiniSweDataset) -> TaskInfo:
|
|
|
292
313
|
},
|
|
293
314
|
"tool": {"name": "run_command", "parallel_tool_calls": False},
|
|
294
315
|
},
|
|
295
|
-
capabilities={
|
|
296
|
-
"supports_rollout": True,
|
|
297
|
-
"supports_env_lifecycle": True,
|
|
298
|
-
"requires_api_key_header": True,
|
|
299
|
-
},
|
|
300
316
|
limits={"max_ops": 2000, "max_time_s": 7200},
|
|
301
317
|
)
|
|
302
318
|
|
|
@@ -348,18 +364,31 @@ def provide_task_instances(
|
|
|
348
364
|
dataset: MiniSweDataset, base_info: TaskInfo, seeds: Sequence[int]
|
|
349
365
|
) -> Iterable[TaskInfo]:
|
|
350
366
|
infos: list[TaskInfo] = []
|
|
367
|
+
base_observation = getattr(base_info, "observation", None)
|
|
368
|
+
if hasattr(base_observation, "model_dump"):
|
|
369
|
+
base_observation_data = base_observation.model_dump()
|
|
370
|
+
elif isinstance(base_observation, dict):
|
|
371
|
+
base_observation_data = dict(base_observation)
|
|
372
|
+
else:
|
|
373
|
+
base_observation_data = {}
|
|
374
|
+
|
|
351
375
|
for seed in seeds:
|
|
352
376
|
instance = dataset.sample_by_index(int(seed))
|
|
353
377
|
infos.append(
|
|
354
378
|
TaskInfo(
|
|
355
379
|
task=base_info.task,
|
|
356
|
-
|
|
380
|
+
environment=base_info.environment,
|
|
357
381
|
action_space=base_info.action_space,
|
|
358
|
-
observation={
|
|
359
|
-
|
|
382
|
+
observation={
|
|
383
|
+
**base_observation_data,
|
|
384
|
+
"instance_id": instance["instance_id"],
|
|
385
|
+
},
|
|
386
|
+
dataset={
|
|
387
|
+
**base_info.dataset.model_dump(),
|
|
388
|
+
"instance_id": instance["instance_id"],
|
|
389
|
+
},
|
|
360
390
|
rubric=base_info.rubric,
|
|
361
391
|
inference=base_info.inference,
|
|
362
|
-
capabilities=base_info.capabilities,
|
|
363
392
|
limits=base_info.limits,
|
|
364
393
|
)
|
|
365
394
|
)
|
|
@@ -397,10 +426,10 @@ def build_config() -> TaskAppConfig:
|
|
|
397
426
|
HostedTaskAppCls = HostedTaskApp
|
|
398
427
|
except Exception:
|
|
399
428
|
try:
|
|
400
|
-
from examples.
|
|
401
|
-
|
|
429
|
+
from examples.task_apps.crafter.task_app.synth_envs_hosted.hosted_app import ( # type: ignore
|
|
430
|
+
create_app,
|
|
402
431
|
)
|
|
403
|
-
HostedTaskAppCls =
|
|
432
|
+
HostedTaskAppCls = create_app
|
|
404
433
|
except Exception as exc: # pragma: no cover - optional dependency path
|
|
405
434
|
logger.warning("Unable to import HostedTaskApp for swe-mini: %s", exc)
|
|
406
435
|
if HostedTaskAppCls is not None:
|
|
@@ -1238,6 +1238,15 @@ async def execute_rollout(
|
|
|
1238
1238
|
)
|
|
1239
1239
|
|
|
1240
1240
|
# Build partial trajectory and return HTTP 200
|
|
1241
|
+
# Extract inference_url from policy meta (best effort)
|
|
1242
|
+
inference_url = None
|
|
1243
|
+
if policy_handle is not None:
|
|
1244
|
+
try:
|
|
1245
|
+
policy_snapshot = policy_handle.snapshot()
|
|
1246
|
+
inference_url = policy_snapshot.get("config", {}).get("inference_url")
|
|
1247
|
+
except Exception:
|
|
1248
|
+
pass
|
|
1249
|
+
|
|
1241
1250
|
trajectory = RolloutTrajectory(
|
|
1242
1251
|
env_id=env_id,
|
|
1243
1252
|
policy_id=policy_id,
|
|
@@ -1249,6 +1258,7 @@ async def execute_rollout(
|
|
|
1249
1258
|
"at_op": op,
|
|
1250
1259
|
},
|
|
1251
1260
|
length=len(trajectory_steps),
|
|
1261
|
+
inference_url=inference_url, # NEW: Required for trace correlation
|
|
1252
1262
|
decision_samples=decision_samples if step_rewards_active else None,
|
|
1253
1263
|
)
|
|
1254
1264
|
metrics = RolloutMetrics(
|
|
@@ -1369,6 +1379,15 @@ async def execute_rollout(
|
|
|
1369
1379
|
},
|
|
1370
1380
|
)
|
|
1371
1381
|
trajectory_steps.append(term_step)
|
|
1382
|
+
# Extract inference_url from policy meta (best effort)
|
|
1383
|
+
inference_url = None
|
|
1384
|
+
if policy_handle is not None:
|
|
1385
|
+
try:
|
|
1386
|
+
policy_snapshot = policy_handle.snapshot()
|
|
1387
|
+
inference_url = policy_snapshot.get("config", {}).get("inference_url")
|
|
1388
|
+
except Exception:
|
|
1389
|
+
pass
|
|
1390
|
+
|
|
1372
1391
|
trajectory = RolloutTrajectory(
|
|
1373
1392
|
env_id=env_id,
|
|
1374
1393
|
policy_id=policy_id,
|
|
@@ -1379,6 +1398,7 @@ async def execute_rollout(
|
|
|
1379
1398
|
"at_op": op,
|
|
1380
1399
|
},
|
|
1381
1400
|
length=len(trajectory_steps),
|
|
1401
|
+
inference_url=inference_url, # NEW: Required for trace correlation
|
|
1382
1402
|
decision_samples=decision_samples if step_rewards_active else None,
|
|
1383
1403
|
)
|
|
1384
1404
|
metrics = RolloutMetrics(
|
|
@@ -1460,6 +1480,15 @@ async def execute_rollout(
|
|
|
1460
1480
|
)
|
|
1461
1481
|
trajectory_steps.append(term_step)
|
|
1462
1482
|
# Build partial response
|
|
1483
|
+
# Extract inference_url from policy meta (best effort)
|
|
1484
|
+
inference_url = None
|
|
1485
|
+
if policy_handle is not None:
|
|
1486
|
+
try:
|
|
1487
|
+
policy_snapshot = policy_handle.snapshot()
|
|
1488
|
+
inference_url = policy_snapshot.get("config", {}).get("inference_url")
|
|
1489
|
+
except Exception:
|
|
1490
|
+
pass
|
|
1491
|
+
|
|
1463
1492
|
trajectory = RolloutTrajectory(
|
|
1464
1493
|
env_id=env_id,
|
|
1465
1494
|
policy_id=policy_id,
|
|
@@ -1471,6 +1500,7 @@ async def execute_rollout(
|
|
|
1471
1500
|
"at_op": op,
|
|
1472
1501
|
},
|
|
1473
1502
|
length=len(trajectory_steps),
|
|
1503
|
+
inference_url=inference_url, # NEW: Required for trace correlation
|
|
1474
1504
|
decision_samples=decision_samples if step_rewards_active else None,
|
|
1475
1505
|
)
|
|
1476
1506
|
metrics = RolloutMetrics(
|
|
@@ -1688,12 +1718,22 @@ async def execute_rollout(
|
|
|
1688
1718
|
timing_final.setdefault("overhead_ms", 0.0)
|
|
1689
1719
|
|
|
1690
1720
|
# Build trajectory
|
|
1721
|
+
# Extract inference_url from policy meta
|
|
1722
|
+
inference_url = None
|
|
1723
|
+
if policy_handle is not None:
|
|
1724
|
+
try:
|
|
1725
|
+
policy_snapshot = policy_handle.snapshot()
|
|
1726
|
+
inference_url = policy_snapshot.get("config", {}).get("inference_url")
|
|
1727
|
+
except Exception:
|
|
1728
|
+
pass
|
|
1729
|
+
|
|
1691
1730
|
trajectory = RolloutTrajectory(
|
|
1692
1731
|
env_id=env_id,
|
|
1693
1732
|
policy_id=policy_id,
|
|
1694
1733
|
steps=trajectory_steps,
|
|
1695
1734
|
final={"observation": _summarize_observation_for_storage(env_handle, current_obs)},
|
|
1696
1735
|
length=len(trajectory_steps),
|
|
1736
|
+
inference_url=inference_url, # NEW: Required for trace correlation
|
|
1697
1737
|
decision_samples=decision_samples if step_rewards_active else None,
|
|
1698
1738
|
)
|
|
1699
1739
|
|
|
@@ -1,15 +1,14 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Simple test script for the GRPO Synth Envs Hosted Service.
|
|
4
|
-
|
|
5
|
-
Run this after starting the service with:
|
|
6
|
-
python main.py
|
|
7
|
-
"""
|
|
2
|
+
"""Manual smoke script for the GRPO Synth Envs Hosted Service."""
|
|
8
3
|
|
|
9
4
|
import asyncio
|
|
10
5
|
import json
|
|
11
6
|
|
|
12
7
|
import httpx
|
|
8
|
+
import pytest
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
pytestmark = pytest.mark.skip(reason="Requires running hosted service on localhost:8000")
|
|
13
12
|
|
|
14
13
|
|
|
15
14
|
async def test_service():
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
# Task App Testing Guide
|
|
2
|
+
|
|
3
|
+
This document describes how to run tests for the task apps in this directory.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Each task app has unit and integration tests following a consistent pattern inspired by the customer environment tests in `customers/`.
|
|
8
|
+
|
|
9
|
+
## Test Structure
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
examples/task_apps/<app_name>/tests/
|
|
13
|
+
├── __init__.py
|
|
14
|
+
├── integration/
|
|
15
|
+
│ ├── __init__.py
|
|
16
|
+
│ └── test_<app>_eval.py # Server startup + eval tests
|
|
17
|
+
└── unit/
|
|
18
|
+
├── __init__.py
|
|
19
|
+
└── test_<app>_*.py # Environment, scoring, dataset tests
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Running Tests
|
|
23
|
+
|
|
24
|
+
### Prerequisites
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# Install test dependencies
|
|
28
|
+
uv sync --dev
|
|
29
|
+
|
|
30
|
+
# Set required environment variables
|
|
31
|
+
export GROQ_API_KEY="your-groq-key"
|
|
32
|
+
export OPENAI_API_KEY="your-openai-key" # For Sokoban
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Run All Tests for a Task App
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
# Verilog
|
|
39
|
+
pytest examples/task_apps/verilog/tests/ -v
|
|
40
|
+
|
|
41
|
+
# Enron
|
|
42
|
+
pytest examples/task_apps/enron/tests/ -v
|
|
43
|
+
|
|
44
|
+
# Sokoban
|
|
45
|
+
pytest examples/task_apps/sokoban/tests/ -v
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Run Only Unit Tests (Fast)
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Runs quickly, no server startup required
|
|
52
|
+
pytest examples/task_apps/verilog/tests/unit/ -v
|
|
53
|
+
pytest examples/task_apps/enron/tests/unit/ -v
|
|
54
|
+
pytest examples/task_apps/sokoban/tests/unit/ -v
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Run Only Integration Tests
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# Slower, starts servers and runs evals
|
|
61
|
+
pytest examples/task_apps/verilog/tests/integration/ -v
|
|
62
|
+
pytest examples/task_apps/enron/tests/integration/ -v
|
|
63
|
+
pytest examples/task_apps/sokoban/tests/integration/ -v
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Run All Task App Tests
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# Run everything
|
|
70
|
+
pytest examples/task_apps/*/tests/ -v
|
|
71
|
+
|
|
72
|
+
# Skip slow tests
|
|
73
|
+
pytest examples/task_apps/*/tests/ -v -m "not slow"
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Test Categories
|
|
77
|
+
|
|
78
|
+
### Unit Tests
|
|
79
|
+
|
|
80
|
+
**Purpose**: Test individual components in isolation
|
|
81
|
+
- Environment initialization
|
|
82
|
+
- Reward calculation
|
|
83
|
+
- Tool implementations
|
|
84
|
+
- State management
|
|
85
|
+
|
|
86
|
+
**Characteristics**:
|
|
87
|
+
- Fast (< 1 second each)
|
|
88
|
+
- No external dependencies
|
|
89
|
+
- No server startup
|
|
90
|
+
- No API calls
|
|
91
|
+
|
|
92
|
+
**Examples**:
|
|
93
|
+
- `test_verilog_scoring.py`: Tests reward components (compile, simulate, submit)
|
|
94
|
+
- `test_enron_environment.py`: Tests search, answer, reward calculation
|
|
95
|
+
- `test_sokoban_environment.py`: Tests actions, rewards, truncation
|
|
96
|
+
|
|
97
|
+
### Integration Tests
|
|
98
|
+
|
|
99
|
+
**Purpose**: Test the full system end-to-end
|
|
100
|
+
- Server startup
|
|
101
|
+
- Health/info endpoints
|
|
102
|
+
- Full evaluation runs
|
|
103
|
+
- **Rollout execution** (manual and policy-driven)
|
|
104
|
+
|
|
105
|
+
**Characteristics**:
|
|
106
|
+
- Slower (30-300 seconds)
|
|
107
|
+
- Requires server startup
|
|
108
|
+
- May require API keys
|
|
109
|
+
- Tests real workflows
|
|
110
|
+
|
|
111
|
+
**Examples**:
|
|
112
|
+
- `test_verilog_eval.py`: Starts server, runs Groq eval with Qwen3-32B
|
|
113
|
+
- `test_verilog_rollout.py`: **Manual & policy rollouts via /rollout endpoint**
|
|
114
|
+
- `test_enron_eval.py`: Starts server, runs Groq eval
|
|
115
|
+
- `test_enron_rollout.py`: **Manual & policy rollouts, auth testing**
|
|
116
|
+
- `test_sokoban_eval.py`: Starts server, tests manual rollout
|
|
117
|
+
- `test_sokoban_rollout.py`: **6 rollout tests (manual, policy, difficulties, limits)**
|
|
118
|
+
|
|
119
|
+
## What Each Test Validates
|
|
120
|
+
|
|
121
|
+
### Verilog Tests
|
|
122
|
+
|
|
123
|
+
**Unit Tests** (4 tests):
|
|
124
|
+
- ✅ Compile success gives +0.1 reward
|
|
125
|
+
- ✅ Simulation pass gives +1.0 reward
|
|
126
|
+
- ✅ Submit success gives +10.0 reward
|
|
127
|
+
- ✅ Submit checks last simulation output correctly
|
|
128
|
+
|
|
129
|
+
**Integration Tests** (5 tests):
|
|
130
|
+
- ✅ Server starts and responds to /health
|
|
131
|
+
- ✅ /task_info returns valid Verilog task metadata
|
|
132
|
+
- ✅ Full eval with Qwen3-32B completes successfully
|
|
133
|
+
- ✅ **Manual rollout** with explicit write/compile/simulate/submit
|
|
134
|
+
- ✅ **Policy rollout** using Groq/Qwen3-32B (verifies LLM integration)
|
|
135
|
+
|
|
136
|
+
### Enron Tests
|
|
137
|
+
|
|
138
|
+
**Unit Tests** (5 tests):
|
|
139
|
+
- ✅ search_emails tool works correctly
|
|
140
|
+
- ✅ answer_question tool calculates rewards
|
|
141
|
+
- ✅ Exact answer match gives high reward (>0.9)
|
|
142
|
+
- ✅ Partial answer match gives medium reward (>0.5)
|
|
143
|
+
- ✅ Wrong answer gives low reward (<0.5)
|
|
144
|
+
|
|
145
|
+
**Integration Tests** (6 tests):
|
|
146
|
+
- ✅ Server starts and responds to /health
|
|
147
|
+
- ✅ /task_info returns valid Enron task metadata
|
|
148
|
+
- ✅ Full eval with Qwen3-32B completes successfully
|
|
149
|
+
- ✅ **Manual rollout** with explicit search/read/answer actions
|
|
150
|
+
- ✅ **Policy rollout** using Groq/Qwen3-32B
|
|
151
|
+
- ✅ **Authentication** enforcement (rejects requests without auth header)
|
|
152
|
+
|
|
153
|
+
### Sokoban Tests
|
|
154
|
+
|
|
155
|
+
**Unit Tests** (3 tests):
|
|
156
|
+
- ✅ Module imports work correctly
|
|
157
|
+
- ✅ Reward components exist (goal achieved, step penalty)
|
|
158
|
+
- ✅ Engine creation with different difficulty levels
|
|
159
|
+
|
|
160
|
+
**Integration Tests** (9 tests):
|
|
161
|
+
- ✅ Server starts and responds to /health
|
|
162
|
+
- ✅ /task_info returns valid Sokoban task metadata
|
|
163
|
+
- ✅ **Manual rollout** with movement actions (left/right/up/down)
|
|
164
|
+
- ✅ **Policy rollout** with OpenAI GPT-5-mini (may skip if slow)
|
|
165
|
+
- ✅ **All difficulty levels** (easy/medium/hard) work correctly
|
|
166
|
+
- ✅ **Max steps limit** enforcement (stops at configured limit)
|
|
167
|
+
- ✅ **Puzzle completion detection** (terminated=True when solved)
|
|
168
|
+
- ✅ Truncation on max_steps
|
|
169
|
+
- ✅ Response structure validation
|
|
170
|
+
|
|
171
|
+
## Debugging Test Failures
|
|
172
|
+
|
|
173
|
+
### Server Won't Start
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
# Check if port is already in use
|
|
177
|
+
lsof -i :<port>
|
|
178
|
+
|
|
179
|
+
# Check logs manually
|
|
180
|
+
uv run -m synth_ai task-app serve <app_name> --port 8999
|
|
181
|
+
|
|
182
|
+
# Check environment variables
|
|
183
|
+
echo $GROQ_API_KEY
|
|
184
|
+
echo $OPENAI_API_KEY
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Tests Timeout
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
# Run with more verbose output
|
|
191
|
+
pytest <test_file> -v -s
|
|
192
|
+
|
|
193
|
+
# Fail any test that runs longer than 60 seconds (requires the pytest-timeout plugin)
|
|
194
|
+
pytest <test_file> -v --timeout=60
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Import Errors
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
# Ensure you're in the right directory
|
|
201
|
+
cd /path/to/synth-ai
|
|
202
|
+
|
|
203
|
+
# Reinstall dependencies
|
|
204
|
+
uv sync --dev
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## CI/CD Integration
|
|
208
|
+
|
|
209
|
+
These tests can be run in CI with:
|
|
210
|
+
|
|
211
|
+
```yaml
|
|
212
|
+
# .github/workflows/test-task-apps.yml
|
|
213
|
+
- name: Run task app tests
|
|
214
|
+
env:
|
|
215
|
+
GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
|
|
216
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
217
|
+
run: |
|
|
218
|
+
# Unit tests (fast, always run)
|
|
219
|
+
pytest examples/task_apps/*/tests/unit/ -v
|
|
220
|
+
|
|
221
|
+
# Integration tests (slower, only on main)
|
|
222
|
+
if [ "$GITHUB_REF" = "refs/heads/main" ]; then
|
|
223
|
+
pytest examples/task_apps/*/tests/integration/ -v --timeout=300
|
|
224
|
+
fi
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
## Adding Tests for New Task Apps
|
|
228
|
+
|
|
229
|
+
When creating a new task app, follow this pattern:
|
|
230
|
+
|
|
231
|
+
1. **Create test structure**:
|
|
232
|
+
```bash
|
|
233
|
+
mkdir -p examples/task_apps/<new_app>/tests/{unit,integration}
|
|
234
|
+
touch examples/task_apps/<new_app>/tests/__init__.py
|
|
235
|
+
touch examples/task_apps/<new_app>/tests/unit/__init__.py
|
|
236
|
+
touch examples/task_apps/<new_app>/tests/integration/__init__.py
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
2. **Create unit tests** (`tests/unit/test_<app>_*.py`):
|
|
240
|
+
- Test environment initialization
|
|
241
|
+
- Test reward calculation
|
|
242
|
+
- Test tool implementations
|
|
243
|
+
- Test edge cases
|
|
244
|
+
|
|
245
|
+
3. **Create integration tests** (`tests/integration/test_<app>_eval.py`):
|
|
246
|
+
- Copy from an existing integration test
|
|
247
|
+
- Update app name, port, config path
|
|
248
|
+
- Add app-specific endpoint tests
|
|
249
|
+
|
|
250
|
+
4. **Add to CI**:
|
|
251
|
+
- Update CI config to include new tests
|
|
252
|
+
- Ensure required env vars are set
|
|
253
|
+
|
|
254
|
+
## Test Coverage Goals
|
|
255
|
+
|
|
256
|
+
- Unit test coverage: >80%
|
|
257
|
+
- Integration test coverage: 100% of critical paths
|
|
258
|
+
- All public APIs have at least one integration test
|
|
259
|
+
- All reward components have unit tests
|
|
260
|
+
|
|
261
|
+
## Common Issues
|
|
262
|
+
|
|
263
|
+
### "Task app terminated immediately"
|
|
264
|
+
- Check that the app name is correct
|
|
265
|
+
- Verify the app is registered in `synth_ai/task/apps.py`
|
|
266
|
+
- Check recent changes to the app code
|
|
267
|
+
|
|
268
|
+
### "GROQ_API_KEY must be set"
|
|
269
|
+
- Set the environment variable
|
|
270
|
+
- Or skip the test: `pytest -k "not groq"`
|
|
271
|
+
|
|
272
|
+
### "Config file not found"
|
|
273
|
+
- Ensure eval config exists in task app directory
|
|
274
|
+
- Check the path in the test matches actual location
|
|
275
|
+
|
|
File without changes
|
|
File without changes
|
|
@@ -68,7 +68,7 @@ def _resolve_repo_root() -> Path:
|
|
|
68
68
|
def _resolve_task_app_root(repo_root: Path) -> Path:
|
|
69
69
|
"""Locate the task_app directory even when the module is copied to a temp mount."""
|
|
70
70
|
|
|
71
|
-
preferred = (repo_root / "examples" / "
|
|
71
|
+
preferred = (repo_root / "examples" / "task_apps" / "crafter" / "task_app").resolve()
|
|
72
72
|
if preferred.is_dir():
|
|
73
73
|
return preferred
|
|
74
74
|
|
|
@@ -81,7 +81,7 @@ def _resolve_task_app_root(repo_root: Path) -> Path:
|
|
|
81
81
|
if (candidate / "synth_envs_hosted").is_dir():
|
|
82
82
|
return candidate
|
|
83
83
|
|
|
84
|
-
fallback = Path("/opt/synth_ai_repo/examples/
|
|
84
|
+
fallback = Path("/opt/synth_ai_repo/examples/task_apps/crafter/task_app")
|
|
85
85
|
if fallback.is_dir():
|
|
86
86
|
return fallback.resolve()
|
|
87
87
|
|
|
@@ -306,13 +306,16 @@ def build_dataset() -> tuple[TaskDatasetRegistry, CrafterDataset]:
|
|
|
306
306
|
def _base_task_info(dataset: CrafterDataset) -> TaskInfo:
|
|
307
307
|
return TaskInfo(
|
|
308
308
|
task={"id": "crafter_classic", "name": "Crafter Classic", "version": "1.0.0"},
|
|
309
|
-
|
|
309
|
+
environment="crafter",
|
|
310
310
|
action_space={
|
|
311
311
|
"type": "discrete",
|
|
312
|
+
"description": f"Discrete action space with {len(crafter_constants.actions)} actions including movement, crafting, and interaction",
|
|
312
313
|
"size": len(crafter_constants.actions),
|
|
313
314
|
"actions": list(crafter_constants.actions),
|
|
314
315
|
},
|
|
315
316
|
observation={
|
|
317
|
+
"type": "dict",
|
|
318
|
+
"description": "RGB frame (64x64x3) plus inventory counts, achievements, and semantic map patches",
|
|
316
319
|
"summary": "RGB frame plus inventory, achievements, and semantic map patches.",
|
|
317
320
|
"keys": ["image", "inventory", "achievements", "semantic_map_patch7"],
|
|
318
321
|
"image_shape": [64, 64, 3],
|
|
@@ -336,11 +339,6 @@ def _base_task_info(dataset: CrafterDataset) -> TaskInfo:
|
|
|
336
339
|
},
|
|
337
340
|
"tool": {"name": "interact", "parallel_tool_calls": False},
|
|
338
341
|
},
|
|
339
|
-
capabilities={
|
|
340
|
-
"supports_rollout": True,
|
|
341
|
-
"supports_env_lifecycle": True,
|
|
342
|
-
"requires_api_key_header": True,
|
|
343
|
-
},
|
|
344
342
|
limits={"max_ops": 100000, "max_time_s": 3600},
|
|
345
343
|
)
|
|
346
344
|
|
|
@@ -366,29 +364,36 @@ def provide_task_instances(
|
|
|
366
364
|
dataset: CrafterDataset, base_info: TaskInfo, seeds: Sequence[int]
|
|
367
365
|
) -> Iterable[TaskInfo]:
|
|
368
366
|
infos: list[TaskInfo] = []
|
|
367
|
+
base_observation = getattr(base_info, "observation", None)
|
|
368
|
+
if hasattr(base_observation, "model_dump"):
|
|
369
|
+
observation_template = base_observation.model_dump()
|
|
370
|
+
elif isinstance(base_observation, dict):
|
|
371
|
+
observation_template = dict(base_observation)
|
|
372
|
+
else:
|
|
373
|
+
observation_template = {}
|
|
374
|
+
|
|
369
375
|
for seed_value in seeds:
|
|
370
376
|
summary = dataset.describe_seed(seed_value)
|
|
371
377
|
infos.append(
|
|
372
378
|
TaskInfo(
|
|
373
379
|
task=base_info.task,
|
|
374
|
-
|
|
380
|
+
environment=base_info.environment,
|
|
375
381
|
action_space=base_info.action_space,
|
|
376
382
|
observation={
|
|
377
|
-
**
|
|
383
|
+
**observation_template,
|
|
378
384
|
"seed": seed_value,
|
|
379
385
|
"traits": summary["traits"],
|
|
380
386
|
"inventory": summary["inventory"],
|
|
381
387
|
"player_position": summary["player_position"],
|
|
382
388
|
},
|
|
383
389
|
dataset={
|
|
384
|
-
**base_info.dataset,
|
|
390
|
+
**base_info.dataset.model_dump(),
|
|
385
391
|
"seed": seed_value,
|
|
386
392
|
"difficulty": summary["difficulty"],
|
|
387
393
|
"config": summary["config"],
|
|
388
394
|
},
|
|
389
395
|
rubric=base_info.rubric,
|
|
390
396
|
inference=base_info.inference,
|
|
391
|
-
capabilities=base_info.capabilities,
|
|
392
397
|
limits=base_info.limits,
|
|
393
398
|
)
|
|
394
399
|
)
|
|
@@ -659,7 +664,7 @@ register_task_app(
|
|
|
659
664
|
# Mount repo root so local modules resolve when deployed on Modal
|
|
660
665
|
(str(REPO_ROOT), "/opt/synth_ai_repo"),
|
|
661
666
|
(str(REPO_ROOT / "synth_ai"), "/opt/synth_ai_repo/synth_ai"),
|
|
662
|
-
(str(TASK_APP_ROOT), "/opt/synth_ai_repo/examples/
|
|
667
|
+
(str(TASK_APP_ROOT), "/opt/synth_ai_repo/examples/task_apps/crafter/task_app"),
|
|
663
668
|
),
|
|
664
669
|
secret_names=("groq-api-key", "openai-api-key"),
|
|
665
670
|
memory=16384,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Compatibility wrapper for the GRPO Crafter task app.
|
|
2
2
|
|
|
3
3
|
This module now delegates to the TaskAppConfig defined in the colocated example at
|
|
4
|
-
`examples/
|
|
4
|
+
`examples/task_apps/crafter/task_app/grpo_crafter.py`. It is kept for legacy usage
|
|
5
5
|
(running the file directly or targeting `fastapi_app` from external tooling). Prefer using
|
|
6
6
|
`uvx synth-ai serve grpo-crafter` for local development and testing.
|
|
7
7
|
"""
|