synth-ai 0.2.9.dev0__py3-none-any.whl → 0.2.23.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/README.md +1 -0
- examples/__init__.py +16 -0
- examples/analyze_semantic_words.sh +17 -0
- examples/baseline/banking77_baseline.py +243 -0
- examples/baseline/banking77_pipeline_baseline.py +294 -0
- examples/baseline/crafter_baseline.py +407 -0
- examples/baseline/pokemon_red_baseline.py +326 -0
- examples/baseline/simple_baseline.py +56 -0
- examples/baseline/warming_up_to_rl_baseline.py +239 -0
- examples/blog_posts/gepa/README.md +355 -0
- examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
- examples/blog_posts/gepa/configs/banking77_gepa_test.toml +80 -0
- examples/blog_posts/gepa/configs/banking77_mipro_local.toml +50 -0
- examples/blog_posts/gepa/configs/banking77_pipeline_gepa_local.toml +101 -0
- examples/blog_posts/gepa/configs/banking77_pipeline_gepa_test.toml +96 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +57 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +35 -0
- examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +51 -0
- examples/blog_posts/gepa/configs/hover_gepa_local.toml +57 -0
- examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +35 -0
- examples/blog_posts/gepa/configs/hover_mipro_local.toml +51 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +57 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +35 -0
- examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +51 -0
- examples/blog_posts/gepa/configs/pupa_gepa_local.toml +58 -0
- examples/blog_posts/gepa/configs/pupa_mipro_local.toml +52 -0
- examples/blog_posts/gepa/deploy_banking77_task_app.sh +54 -0
- examples/blog_posts/gepa/gepa_baseline.py +204 -0
- examples/blog_posts/gepa/query_prompts_example.py +97 -0
- examples/blog_posts/gepa/run_gepa_banking77.sh +112 -0
- examples/blog_posts/gepa/run_gepa_banking77_pipeline.sh +163 -0
- examples/blog_posts/gepa/task_apps.py +105 -0
- examples/blog_posts/gepa/test_gepa_local.sh +67 -0
- examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
- examples/blog_posts/mipro/README.md +415 -0
- examples/blog_posts/mipro/configs/banking77_mipro_local.toml +91 -0
- examples/blog_posts/mipro/configs/banking77_mipro_test.toml +87 -0
- examples/blog_posts/mipro/configs/banking77_pipeline_mipro_gemini_flash_lite_local.toml +98 -0
- examples/blog_posts/mipro/configs/banking77_pipeline_mipro_gpt41mini_local.toml +96 -0
- examples/blog_posts/mipro/configs/banking77_pipeline_mipro_local.toml +94 -0
- examples/blog_posts/mipro/configs/banking77_pipeline_mipro_test.toml +170 -0
- examples/blog_posts/mipro/deploy_banking77_pipeline_task_app.sh +59 -0
- examples/blog_posts/mipro/deploy_banking77_task_app.sh +41 -0
- examples/blog_posts/mipro/multi_step.md +79 -0
- examples/blog_posts/mipro/run_mipro_banking77.sh +191 -0
- examples/blog_posts/mipro/run_mipro_banking77_pipeline.sh +171 -0
- examples/blog_posts/mipro/run_mipro_banking77_pipeline_gemini_flash_lite.sh +177 -0
- examples/blog_posts/mipro/run_mipro_banking77_pipeline_gpt41mini.sh +173 -0
- examples/blog_posts/mipro/verify_banking77_setup.sh +117 -0
- examples/blog_posts/pokemon_vl/README.md +98 -0
- examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
- examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
- examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
- examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
- examples/blog_posts/pokemon_vl/extract_images.py +239 -0
- examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
- examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
- examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
- examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
- examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
- examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
- examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
- examples/blog_posts/warming_up_to_rl/README.md +158 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
- examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
- examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
- examples/crafter_debug_render.py +186 -0
- examples/dev/qwen3_32b_qlora_4xh100.toml +45 -0
- examples/gepa/banking77_pipeline_gepa.toml +96 -0
- examples/gepa/multi_stage_gepa_example.toml +84 -0
- examples/gepa/run_gepa_banking77_pipeline.sh +157 -0
- examples/multi_step/SFT_README.md +147 -0
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +103 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +196 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +75 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +145 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +84 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +79 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
- examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +147 -0
- examples/multi_step/convert_traces_to_sft.py +84 -0
- examples/multi_step/crafter_rl_lora.md +70 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/run_sft_qwen30b.sh +45 -0
- examples/multi_step/sse_metrics_streaming_notes.md +357 -0
- examples/multi_step/task_app_config_notes.md +494 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/README.md +102 -0
- examples/qwen_coder/_shared.py +113 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +60 -0
- examples/qwen_coder/configs/coder_lora_4b.toml +61 -0
- examples/qwen_coder/configs/coder_lora_small.toml +57 -0
- examples/qwen_coder/generate_dataset.py +98 -0
- examples/qwen_coder/infer_ft_smoke.py +65 -0
- examples/qwen_coder/infer_prod_proxy.py +73 -0
- examples/qwen_coder/infer_via_synth.py +87 -0
- examples/qwen_coder/scripts/infer_coder.sh +19 -0
- examples/qwen_coder/scripts/train_coder_30b.sh +22 -0
- examples/qwen_coder/sft_full_17b.py +103 -0
- examples/qwen_coder/sft_lora_30b.py +110 -0
- examples/qwen_coder/subset_jsonl.py +39 -0
- examples/qwen_coder/todos.md +38 -0
- examples/qwen_coder/validate_jsonl.py +60 -0
- examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
- examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
- examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
- examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
- examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
- examples/qwen_vl/QUICKSTART.md +327 -0
- examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
- examples/qwen_vl/README.md +152 -0
- examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
- examples/qwen_vl/RL_VISION_TESTING.md +333 -0
- examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
- examples/qwen_vl/SETUP_COMPLETE.md +274 -0
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +489 -0
- examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
- examples/qwen_vl/__init__.py +2 -0
- examples/qwen_vl/collect_data_via_cli.md +415 -0
- examples/qwen_vl/collect_vision_traces.py +368 -0
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +110 -0
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +59 -0
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +26 -0
- examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +26 -0
- examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
- examples/qwen_vl/configs/filter_qwen3vl_sft.toml +49 -0
- examples/qwen_vl/configs/filter_vision_sft.toml +52 -0
- examples/qwen_vl/configs/filter_vision_test.toml +8 -0
- examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
- examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
- examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
- examples/qwen_vl/run_vision_comparison.sh +61 -0
- examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
- examples/qwen_vl/test_image_validation.py +201 -0
- examples/qwen_vl/test_sft_vision_data.py +110 -0
- examples/rl/README.md +169 -0
- examples/rl/configs/eval_base_qwen.toml +17 -0
- examples/rl/configs/eval_rl_qwen.toml +13 -0
- examples/rl/configs/rl_from_base_qwen.toml +62 -0
- examples/rl/configs/rl_from_base_qwen17.toml +80 -0
- examples/rl/configs/rl_from_ft_qwen.toml +37 -0
- examples/rl/download_dataset.py +80 -0
- examples/rl/run_eval.py +436 -0
- examples/rl/run_rl_and_save.py +111 -0
- examples/rl/task_app/README.md +21 -0
- {synth_ai/task/apps → examples/rl/task_app}/math_single_step.py +188 -50
- examples/rl/task_app/math_task_app.py +111 -0
- examples/run_crafter_demo.sh +10 -0
- examples/sdk_prompt_learning_example.py +55 -0
- examples/sft/README.md +139 -0
- examples/sft/configs/crafter_fft_qwen0p6b.toml +49 -0
- examples/sft/configs/crafter_lora_qwen0p6b.toml +49 -0
- examples/sft/evaluate.py +117 -0
- examples/sft/export_dataset.py +120 -0
- examples/sft/generate_traces.py +164 -0
- examples/swe/__init__.py +12 -0
- examples/swe/task_app/README.md +135 -0
- examples/swe/task_app/__init__.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +604 -0
- examples/swe/task_app/grpo_swe_mini_task_app.py +124 -0
- examples/swe/task_app/hosted/README.md +173 -0
- examples/swe/task_app/hosted/__init__.py +5 -0
- examples/swe/task_app/hosted/branching.py +143 -0
- examples/swe/task_app/hosted/environment_routes.py +1289 -0
- examples/swe/task_app/hosted/envs/__init__.py +1 -0
- examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
- examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
- examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
- examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
- examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
- examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
- examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +1191 -0
- examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
- examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
- examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
- examples/swe/task_app/hosted/hosted_app.py +204 -0
- examples/swe/task_app/hosted/inference/__init__.py +5 -0
- examples/swe/task_app/hosted/inference/openai_client.py +584 -0
- examples/swe/task_app/hosted/main.py +100 -0
- examples/swe/task_app/hosted/policy_routes.py +1094 -0
- examples/swe/task_app/hosted/registry.py +195 -0
- examples/swe/task_app/hosted/rollout.py +1905 -0
- examples/swe/task_app/hosted/storage/__init__.py +5 -0
- examples/swe/task_app/hosted/storage/volume.py +211 -0
- examples/swe/task_app/hosted/test_agents.py +161 -0
- examples/swe/task_app/hosted/test_service.py +136 -0
- examples/swe/task_app/hosted/utils.py +62 -0
- examples/swe/task_app/morph_backend.py +178 -0
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/banking77/__init__.py +6 -0
- examples/task_apps/banking77/banking77_task_app.py +912 -0
- examples/task_apps/banking77/deploy_wrapper.py +46 -0
- examples/task_apps/banking77_pipeline/__init__.py +6 -0
- examples/task_apps/banking77_pipeline/banking77_pipeline_task_app.py +489 -0
- examples/task_apps/banking77_pipeline/deploy_wrapper.py +50 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +286 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +187 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +281 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/README.md +42 -0
- examples/task_apps/crafter/task_app/__init__.py +5 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +1055 -0
- examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +146 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/README.md +173 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/branching.py +143 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/environment_routes.py +1226 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +532 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +583 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +122 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +253 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +999 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/main.py +100 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +1252 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/registry.py +195 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +2233 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/storage/volume.py +211 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/test_agents.py +161 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/test_service.py +136 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +411 -0
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +2 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +4 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +4 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +4 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/gepa_benchmarks/__init__.py +7 -0
- examples/task_apps/gepa_benchmarks/common.py +260 -0
- examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
- examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
- examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
- examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
- examples/task_apps/math/README.md +21 -0
- examples/task_apps/math/math_single_step.py +1000 -0
- examples/task_apps/math/math_task_app.py +115 -0
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README.md +356 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +428 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +30 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +224 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
- examples/task_apps/pokemon_red/task_app.py +1048 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
- examples/task_apps/sokoban/README.md +306 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +4 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +22 -0
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +4 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +4 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +4 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/tunnel_gepa_banking77/README.md +106 -0
- examples/tunnel_gepa_banking77/banking77_gepa_tunnel.toml +95 -0
- examples/tunnel_gepa_banking77/keep_tunnel_running.py +60 -0
- examples/tunnel_gepa_banking77/run_gepa_with_tunnel.sh +226 -0
- examples/vlm/PROPOSAL.md +53 -0
- examples/vlm/README.md +68 -0
- examples/vlm/configs/crafter_vlm_gpt4o.toml +49 -0
- examples/vlm/crafter_image_only_agent.py +207 -0
- examples/vlm/crafter_openai_vlm_agent.py +275 -0
- examples/vlm/filter_image_rows.py +63 -0
- examples/vlm/run_crafter_vlm_benchmark.py +316 -0
- examples/warming_up_to_rl/_utils.py +92 -0
- examples/warming_up_to_rl/analyze_trace_db.py +422 -0
- examples/warming_up_to_rl/configs/crafter_fft.toml +53 -0
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +54 -0
- examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +22 -0
- examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +15 -0
- examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +24 -0
- examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +35 -0
- examples/warming_up_to_rl/configs/eval_stepwise_consistent.toml +26 -0
- examples/warming_up_to_rl/configs/eval_stepwise_per_achievement.toml +36 -0
- examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +32 -0
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +85 -0
- examples/warming_up_to_rl/configs/rl_from_ft.toml +58 -0
- examples/warming_up_to_rl/export_trace_sft.py +837 -0
- examples/warming_up_to_rl/groq_test.py +97 -0
- examples/warming_up_to_rl/manage_secrets.py +131 -0
- examples/warming_up_to_rl/old/event_rewards.md +234 -0
- examples/warming_up_to_rl/old/notes.md +73 -0
- examples/warming_up_to_rl/readme.md +110 -0
- examples/warming_up_to_rl/run_eval.py +736 -0
- examples/warming_up_to_rl/run_fft_and_save.py +380 -0
- examples/warming_up_to_rl/run_local_rollout.py +239 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +248 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +405 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +477 -0
- examples/warming_up_to_rl/run_rl_and_save.py +124 -0
- examples/warming_up_to_rl/run_rollout_remote.py +156 -0
- examples/warming_up_to_rl/task_app/README.md +42 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +876 -0
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +454 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +253 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +729 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1114 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1891 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +129 -0
- examples/workflows/math_rl/configs/eval_base_qwen.toml +15 -0
- examples/workflows/math_rl/configs/eval_rl_qwen.toml +11 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen.toml +62 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +80 -0
- examples/workflows/math_rl/configs/rl_from_ft_qwen.toml +35 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- examples/workflows/math_rl/run_eval.py +436 -0
- examples/workflows/math_rl/run_rl_and_save.py +111 -0
- synth_ai/__init__.py +47 -23
- synth_ai/_utils/__init__.py +47 -0
- synth_ai/_utils/base_url.py +10 -0
- synth_ai/_utils/http.py +10 -0
- synth_ai/_utils/prompts.py +10 -0
- synth_ai/_utils/task_app_state.py +12 -0
- synth_ai/_utils/user_config.py +10 -0
- synth_ai/api/models/supported.py +514 -0
- synth_ai/api/train/__init__.py +60 -2
- synth_ai/api/train/builders.py +347 -39
- synth_ai/api/train/cli.py +895 -160
- synth_ai/api/train/config_finder.py +103 -25
- synth_ai/api/train/configs/__init__.py +65 -0
- synth_ai/api/train/configs/prompt_learning.py +496 -0
- synth_ai/api/train/configs/rl.py +188 -0
- synth_ai/api/train/configs/sft.py +99 -0
- synth_ai/api/train/configs/shared.py +81 -0
- synth_ai/api/train/env_resolver.py +70 -20
- synth_ai/api/train/pollers.py +29 -4
- synth_ai/api/train/prompt_learning.py +425 -0
- synth_ai/api/train/sft.py +390 -0
- synth_ai/api/train/supported_algos.py +147 -0
- synth_ai/api/train/task_app.py +6 -4
- synth_ai/api/train/utils.py +64 -52
- synth_ai/api/train/validators.py +1117 -0
- synth_ai/api/tunnel.py +49 -0
- synth_ai/auth/credentials.py +94 -0
- synth_ai/baseline/__init__.py +25 -0
- synth_ai/baseline/config.py +209 -0
- synth_ai/baseline/discovery.py +214 -0
- synth_ai/baseline/execution.py +146 -0
- synth_ai/cfgs.py +227 -0
- synth_ai/cli/__init__.py +85 -63
- synth_ai/cli/_modal_wrapper.py +31 -0
- synth_ai/cli/_storage.py +20 -0
- synth_ai/cli/_typer_patch.py +47 -0
- synth_ai/cli/_validate_task_app.py +29 -0
- synth_ai/cli/balance.py +16 -4
- synth_ai/cli/calc.py +36 -21
- synth_ai/cli/claude.py +70 -0
- synth_ai/cli/codex.py +267 -0
- synth_ai/cli/commands/__init__.py +18 -0
- synth_ai/cli/commands/baseline/__init__.py +12 -0
- synth_ai/cli/commands/baseline/core.py +637 -0
- synth_ai/cli/commands/baseline/list.py +93 -0
- synth_ai/cli/commands/demo/__init__.py +6 -0
- synth_ai/cli/commands/demo/core.py +163 -0
- synth_ai/cli/commands/eval/__init__.py +19 -0
- synth_ai/cli/commands/eval/core.py +1112 -0
- synth_ai/cli/commands/eval/errors.py +81 -0
- synth_ai/cli/commands/eval/validation.py +133 -0
- synth_ai/cli/commands/filter/__init__.py +12 -0
- synth_ai/cli/commands/filter/core.py +424 -0
- synth_ai/cli/commands/filter/errors.py +55 -0
- synth_ai/cli/commands/filter/validation.py +77 -0
- synth_ai/cli/commands/help/__init__.py +185 -0
- synth_ai/cli/commands/help/core.py +72 -0
- synth_ai/cli/commands/smoke/__init__.py +7 -0
- synth_ai/cli/commands/smoke/core.py +1437 -0
- synth_ai/cli/commands/status/__init__.py +66 -0
- synth_ai/cli/commands/status/client.py +192 -0
- synth_ai/cli/commands/status/config.py +92 -0
- synth_ai/cli/commands/status/errors.py +20 -0
- synth_ai/cli/commands/status/formatters.py +164 -0
- synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
- synth_ai/cli/commands/status/subcommands/files.py +79 -0
- synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
- synth_ai/cli/commands/status/subcommands/models.py +79 -0
- synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
- synth_ai/cli/commands/status/subcommands/runs.py +81 -0
- synth_ai/cli/commands/status/subcommands/session.py +183 -0
- synth_ai/cli/commands/status/subcommands/summary.py +47 -0
- synth_ai/cli/commands/status/subcommands/usage.py +203 -0
- synth_ai/cli/commands/status/utils.py +114 -0
- synth_ai/cli/commands/train/__init__.py +53 -0
- synth_ai/cli/commands/train/core.py +21 -0
- synth_ai/cli/commands/train/errors.py +117 -0
- synth_ai/cli/commands/train/judge_schemas.py +200 -0
- synth_ai/cli/commands/train/judge_validation.py +305 -0
- synth_ai/cli/commands/train/validation.py +386 -0
- synth_ai/cli/demo.py +32 -140
- synth_ai/cli/deploy.py +233 -0
- synth_ai/cli/eval/__init__.py +36 -0
- synth_ai/cli/eval/core.py +5 -0
- synth_ai/cli/eval/errors.py +31 -0
- synth_ai/cli/eval/validation.py +5 -0
- synth_ai/cli/filter/__init__.py +28 -0
- synth_ai/cli/filter/core.py +5 -0
- synth_ai/cli/filter/errors.py +23 -0
- synth_ai/cli/filter/validation.py +5 -0
- synth_ai/cli/legacy_root_backup.py +28 -22
- synth_ai/cli/lib/__init__.py +10 -0
- synth_ai/cli/lib/task_app_discovery.py +7 -0
- synth_ai/cli/lib/task_app_env.py +518 -0
- synth_ai/cli/mcp.py +34 -0
- synth_ai/cli/modal_serve/__init__.py +12 -0
- synth_ai/cli/modal_serve/core.py +14 -0
- synth_ai/cli/modal_serve/errors.py +8 -0
- synth_ai/cli/modal_serve/validation.py +11 -0
- synth_ai/cli/opencode.py +256 -0
- synth_ai/cli/recent.py +13 -7
- synth_ai/cli/rl_demo.py +156 -116
- synth_ai/cli/root.py +131 -132
- synth_ai/cli/serve/__init__.py +12 -0
- synth_ai/cli/serve/core.py +14 -0
- synth_ai/cli/serve/errors.py +8 -0
- synth_ai/cli/serve/validation.py +11 -0
- synth_ai/cli/setup.py +49 -0
- synth_ai/cli/status.py +7 -125
- synth_ai/cli/task_app_deploy.py +7 -0
- synth_ai/cli/task_app_list.py +25 -0
- synth_ai/cli/task_app_modal_serve.py +11 -0
- synth_ai/cli/task_app_serve.py +11 -0
- synth_ai/cli/task_apps.py +2284 -257
- synth_ai/cli/traces.py +9 -5
- synth_ai/cli/train/__init__.py +12 -0
- synth_ai/cli/train/core.py +21 -0
- synth_ai/cli/train/errors.py +8 -0
- synth_ai/cli/train/validation.py +24 -0
- synth_ai/cli/train.py +5 -0
- synth_ai/cli/turso.py +73 -0
- synth_ai/cli/watch.py +13 -18
- synth_ai/demos/__init__.py +10 -0
- synth_ai/demos/core/__init__.py +28 -1
- synth_ai/demos/core/cli.py +579 -291
- synth_ai/demos/crafter/__init__.py +1 -0
- synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
- synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
- synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
- synth_ai/demos/demo_registry.py +176 -0
- synth_ai/demos/demo_task_apps/__init__.py +3 -3
- synth_ai/demos/demo_task_apps/core.py +64 -28
- synth_ai/demos/demo_task_apps/crafter/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +53 -0
- synth_ai/demos/demo_task_apps/crafter/configs/rl_from_base_qwen4b.toml +73 -0
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +184 -0
- synth_ai/demos/demo_task_apps/math/_common.py +1 -2
- synth_ai/demos/demo_task_apps/math/app.py +2 -1
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -6
- synth_ai/demos/demo_task_apps/math/modal_task_app.py +185 -83
- synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -2
- synth_ai/demos/math/__init__.py +1 -0
- synth_ai/demos/math/_common.py +16 -0
- synth_ai/demos/math/app.py +38 -0
- synth_ai/demos/math/config.toml +76 -0
- synth_ai/demos/math/deploy_modal.py +54 -0
- synth_ai/demos/math/modal_task_app.py +703 -0
- synth_ai/demos/math/task_app_entry.py +51 -0
- synth_ai/environments/environment/core.py +7 -1
- synth_ai/environments/examples/bandit/engine.py +12 -5
- synth_ai/environments/examples/bandit/environment.py +0 -1
- synth_ai/environments/examples/bandit/taskset.py +4 -4
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
- synth_ai/environments/examples/crafter_classic/environment.py +93 -2
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +60 -12
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +86 -0
- synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +104 -12
- synth_ai/environments/examples/wordle/environment.py +0 -1
- synth_ai/environments/reproducibility/tree.py +5 -6
- synth_ai/environments/service/app.py +11 -12
- synth_ai/environments/service/core_routes.py +10 -9
- synth_ai/environments/stateful/engine.py +1 -1
- synth_ai/environments/tasks/core.py +1 -0
- synth_ai/environments/tasks/filters.py +5 -6
- synth_ai/environments/tasks/utils.py +4 -5
- synth_ai/evals/__init__.py +15 -0
- synth_ai/evals/base.py +14 -5
- synth_ai/evals/client.py +82 -0
- synth_ai/evals/types.py +42 -0
- synth_ai/http.py +8 -22
- synth_ai/http_client.py +45 -12
- synth_ai/inference/__init__.py +0 -2
- synth_ai/inference/client.py +21 -7
- synth_ai/jobs/client.py +129 -80
- synth_ai/judge_schemas.py +127 -0
- synth_ai/learning/__init__.py +51 -6
- synth_ai/learning/algorithms.py +14 -0
- synth_ai/learning/client.py +122 -30
- synth_ai/learning/config.py +2 -40
- synth_ai/learning/constants.py +0 -2
- synth_ai/learning/ft_client.py +4 -56
- synth_ai/learning/health.py +14 -8
- synth_ai/learning/jobs.py +43 -47
- synth_ai/learning/prompt_learning_client.py +276 -0
- synth_ai/learning/prompt_learning_types.py +185 -0
- synth_ai/{rl → learning/rl}/__init__.py +14 -5
- synth_ai/learning/rl/client.py +269 -0
- synth_ai/learning/rl/config.py +31 -0
- synth_ai/{rl → learning/rl}/contracts.py +5 -10
- synth_ai/{rl → learning/rl}/env_keys.py +45 -16
- synth_ai/learning/rl/secrets.py +13 -0
- synth_ai/learning/rl_client.py +2 -253
- synth_ai/learning/sft/__init__.py +29 -0
- synth_ai/learning/sft/client.py +68 -0
- synth_ai/learning/sft/config.py +270 -0
- synth_ai/learning/sft/data.py +698 -0
- synth_ai/learning/sse.py +25 -26
- synth_ai/learning/validators.py +29 -25
- synth_ai/mcp/__init__.py +5 -0
- synth_ai/mcp/__main__.py +8 -0
- synth_ai/mcp/main.py +254 -0
- synth_ai/mcp/setup.py +100 -0
- synth_ai/modal.py +257 -0
- synth_ai/pricing/__init__.py +3 -0
- synth_ai/pricing/model_pricing.py +64 -0
- synth_ai/session/__init__.py +75 -0
- synth_ai/session/client.py +383 -0
- synth_ai/session/constants.py +63 -0
- synth_ai/session/exceptions.py +105 -0
- synth_ai/session/manager.py +139 -0
- synth_ai/session/models.py +89 -0
- synth_ai/session/query.py +110 -0
- synth_ai/spec/__init__.py +46 -0
- synth_ai/spec/dataclasses.py +149 -0
- synth_ai/spec/loader.py +144 -0
- synth_ai/spec/serializer.py +199 -0
- synth_ai/spec/validation.py +250 -0
- synth_ai/streaming/__init__.py +29 -0
- synth_ai/streaming/config.py +94 -0
- synth_ai/streaming/handlers.py +589 -0
- synth_ai/streaming/streamer.py +320 -0
- synth_ai/streaming/types.py +95 -0
- synth_ai/task/__init__.py +50 -30
- synth_ai/task/apps/__init__.py +63 -19
- synth_ai/task/auth.py +35 -23
- synth_ai/task/client.py +15 -13
- synth_ai/task/config.py +261 -0
- synth_ai/task/contracts.py +165 -64
- synth_ai/task/datasets.py +9 -6
- synth_ai/task/errors.py +11 -10
- synth_ai/task/health.py +17 -11
- synth_ai/task/inference_api.py +101 -0
- synth_ai/task/json.py +58 -24
- synth_ai/task/proxy.py +59 -66
- synth_ai/task/rubrics/__init__.py +55 -0
- synth_ai/task/rubrics/loaders.py +156 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +116 -0
- synth_ai/task/rubrics/strict.py +149 -0
- synth_ai/task/rubrics.py +22 -15
- synth_ai/task/server.py +65 -31
- synth_ai/task/trace_correlation_helpers.py +328 -0
- synth_ai/task/tracing_utils.py +44 -28
- synth_ai/task/validators.py +449 -6
- synth_ai/task/vendors.py +5 -7
- synth_ai/tracing_v3/__init__.py +4 -0
- synth_ai/tracing_v3/abstractions.py +21 -4
- synth_ai/tracing_v3/config.py +167 -22
- synth_ai/tracing_v3/constants.py +21 -0
- synth_ai/tracing_v3/db_config.py +42 -29
- synth_ai/tracing_v3/decorators.py +80 -45
- synth_ai/tracing_v3/examples/basic_usage.py +15 -9
- synth_ai/tracing_v3/hooks.py +6 -4
- synth_ai/tracing_v3/llm_call_record_helpers.py +161 -61
- synth_ai/tracing_v3/migration_helper.py +1 -2
- synth_ai/tracing_v3/replica_sync.py +12 -7
- synth_ai/tracing_v3/serialization.py +130 -0
- synth_ai/tracing_v3/session_tracer.py +73 -16
- synth_ai/tracing_v3/storage/base.py +89 -1
- synth_ai/tracing_v3/storage/config.py +63 -16
- synth_ai/tracing_v3/storage/factory.py +11 -9
- synth_ai/tracing_v3/storage/utils.py +15 -11
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/__init__.py +8 -21
- synth_ai/tracing_v3/turso/daemon.py +123 -15
- synth_ai/tracing_v3/turso/models.py +5 -2
- synth_ai/tracing_v3/turso/native_manager.py +1293 -0
- synth_ai/tracing_v3/utils.py +5 -4
- synth_ai/tunnel.py +143 -0
- synth_ai/tunnel_deploy.py +278 -0
- synth_ai/types.py +8 -0
- synth_ai/urls.py +11 -0
- synth_ai/utils/__init__.py +166 -0
- synth_ai/utils/agents.py +74 -0
- synth_ai/utils/apps.py +152 -0
- synth_ai/utils/base_url.py +94 -0
- synth_ai/utils/bin.py +39 -0
- synth_ai/utils/claude.py +36 -0
- synth_ai/utils/cli.py +284 -0
- synth_ai/utils/config.py +81 -0
- synth_ai/utils/env.py +346 -0
- synth_ai/utils/errors.py +85 -0
- synth_ai/utils/http.py +172 -0
- synth_ai/utils/json.py +72 -0
- synth_ai/utils/log_filter.py +99 -0
- synth_ai/utils/logging.py +198 -0
- synth_ai/utils/modal.py +299 -0
- synth_ai/utils/paths.py +95 -0
- synth_ai/utils/process.py +233 -0
- synth_ai/utils/prompts.py +39 -0
- synth_ai/utils/sqld.py +122 -0
- synth_ai/utils/ssl.py +25 -0
- synth_ai/utils/task_app_discovery.py +882 -0
- synth_ai/utils/task_app_env.py +186 -0
- synth_ai/utils/task_app_state.py +318 -0
- synth_ai/utils/tunnel/__init__.py +12 -0
- synth_ai/utils/tunnel/config.py +55 -0
- synth_ai/utils/user_config.py +137 -0
- synth_ai/uvicorn.py +77 -0
- synth_ai-0.2.23.dev3.dist-info/METADATA +357 -0
- synth_ai-0.2.23.dev3.dist-info/RECORD +983 -0
- {synth_ai-0.2.9.dev0.dist-info → synth_ai-0.2.23.dev3.dist-info}/entry_points.txt +0 -1
- {synth_ai-0.2.9.dev0.dist-info → synth_ai-0.2.23.dev3.dist-info}/top_level.txt +1 -0
- synth_ai/cli/man.py +0 -106
- synth_ai/core/experiment.py +0 -15
- synth_ai/core/system.py +0 -15
- synth_ai/demo_registry.py +0 -258
- synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
- synth_ai/experimental/synth_oss.py +0 -446
- synth_ai/handshake.py +0 -107
- synth_ai/install_sqld.sh +0 -40
- synth_ai/learning/offline/dpo.py +0 -0
- synth_ai/learning/offline/providers.py +0 -7
- synth_ai/learning/offline/sft.py +0 -0
- synth_ai/learning/offline/shared.py +0 -0
- synth_ai/learning/online/grpo.py +0 -0
- synth_ai/learning/online/irft.py +0 -0
- synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
- synth_ai/learning/prompts/gepa.py +0 -0
- synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -213
- synth_ai/learning/prompts/mipro.py +0 -289
- synth_ai/learning/prompts/random_search.py +0 -246
- synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
- synth_ai/learning/prompts/run_random_search_banking77.py +0 -324
- synth_ai/lm/__init__.py +0 -51
- synth_ai/lm/caching/constants.py +0 -6
- synth_ai/lm/caching/dbs.py +0 -0
- synth_ai/lm/caching/ephemeral.py +0 -102
- synth_ai/lm/caching/handler.py +0 -137
- synth_ai/lm/caching/initialize.py +0 -11
- synth_ai/lm/caching/persistent.py +0 -114
- synth_ai/lm/config.py +0 -110
- synth_ai/lm/constants.py +0 -32
- synth_ai/lm/core/__init__.py +0 -8
- synth_ai/lm/core/all.py +0 -73
- synth_ai/lm/core/exceptions.py +0 -7
- synth_ai/lm/core/main.py +0 -319
- synth_ai/lm/core/main_v3.py +0 -594
- synth_ai/lm/core/synth_models.py +0 -48
- synth_ai/lm/core/vendor_clients.py +0 -188
- synth_ai/lm/cost/monitor.py +0 -1
- synth_ai/lm/cost/statefulness.py +0 -1
- synth_ai/lm/injection.py +0 -80
- synth_ai/lm/overrides.py +0 -206
- synth_ai/lm/provider_support/__init__.py +0 -8
- synth_ai/lm/provider_support/anthropic.py +0 -972
- synth_ai/lm/provider_support/openai.py +0 -1139
- synth_ai/lm/provider_support/suppress_logging.py +0 -31
- synth_ai/lm/structured_outputs/handler.py +0 -440
- synth_ai/lm/structured_outputs/inject.py +0 -297
- synth_ai/lm/structured_outputs/rehabilitate.py +0 -185
- synth_ai/lm/tools/__init__.py +0 -3
- synth_ai/lm/tools/base.py +0 -172
- synth_ai/lm/unified_interface.py +0 -202
- synth_ai/lm/vendors/base.py +0 -81
- synth_ai/lm/vendors/core/anthropic_api.py +0 -387
- synth_ai/lm/vendors/core/gemini_api.py +0 -292
- synth_ai/lm/vendors/core/mistral_api.py +0 -322
- synth_ai/lm/vendors/core/openai_api.py +0 -225
- synth_ai/lm/vendors/core/synth_dev_api.py +0 -0
- synth_ai/lm/vendors/local/ollama.py +0 -0
- synth_ai/lm/vendors/openai_standard.py +0 -780
- synth_ai/lm/vendors/openai_standard_responses.py +0 -256
- synth_ai/lm/vendors/retries.py +0 -22
- synth_ai/lm/vendors/supported/custom_endpoint.py +0 -417
- synth_ai/lm/vendors/supported/deepseek.py +0 -69
- synth_ai/lm/vendors/supported/grok.py +0 -75
- synth_ai/lm/vendors/supported/groq.py +0 -16
- synth_ai/lm/vendors/supported/ollama.py +0 -15
- synth_ai/lm/vendors/supported/openrouter.py +0 -74
- synth_ai/lm/vendors/supported/together.py +0 -11
- synth_ai/lm/vendors/synth_client.py +0 -808
- synth_ai/lm/warmup.py +0 -186
- synth_ai/rl/secrets.py +0 -19
- synth_ai/scripts/verify_rewards.py +0 -100
- synth_ai/task/apps/grpo_crafter.py +0 -438
- synth_ai/tracing/__init__.py +0 -30
- synth_ai/tracing_v1/__init__.py +0 -33
- synth_ai/tracing_v3/turso/manager.py +0 -774
- synth_ai/v0/tracing/abstractions.py +0 -224
- synth_ai/v0/tracing/base_client.py +0 -91
- synth_ai/v0/tracing/client_manager.py +0 -131
- synth_ai/v0/tracing/config.py +0 -142
- synth_ai/v0/tracing/context.py +0 -146
- synth_ai/v0/tracing/decorators.py +0 -682
- synth_ai/v0/tracing/events/__init__.py +0 -0
- synth_ai/v0/tracing/events/manage.py +0 -147
- synth_ai/v0/tracing/events/scope.py +0 -86
- synth_ai/v0/tracing/events/store.py +0 -228
- synth_ai/v0/tracing/immediate_client.py +0 -151
- synth_ai/v0/tracing/local.py +0 -18
- synth_ai/v0/tracing/log_client_base.py +0 -73
- synth_ai/v0/tracing/retry_queue.py +0 -186
- synth_ai/v0/tracing/trackers.py +0 -515
- synth_ai/v0/tracing/upload.py +0 -512
- synth_ai/v0/tracing/utils.py +0 -9
- synth_ai/v0/tracing_v1/__init__.py +0 -16
- synth_ai/v0/tracing_v1/abstractions.py +0 -224
- synth_ai/v0/tracing_v1/base_client.py +0 -91
- synth_ai/v0/tracing_v1/client_manager.py +0 -131
- synth_ai/v0/tracing_v1/config.py +0 -142
- synth_ai/v0/tracing_v1/context.py +0 -146
- synth_ai/v0/tracing_v1/decorators.py +0 -703
- synth_ai/v0/tracing_v1/events/__init__.py +0 -0
- synth_ai/v0/tracing_v1/events/manage.py +0 -147
- synth_ai/v0/tracing_v1/events/scope.py +0 -86
- synth_ai/v0/tracing_v1/events/store.py +0 -228
- synth_ai/v0/tracing_v1/immediate_client.py +0 -151
- synth_ai/v0/tracing_v1/local.py +0 -18
- synth_ai/v0/tracing_v1/log_client_base.py +0 -73
- synth_ai/v0/tracing_v1/retry_queue.py +0 -186
- synth_ai/v0/tracing_v1/trackers.py +0 -515
- synth_ai/v0/tracing_v1/upload.py +0 -527
- synth_ai/v0/tracing_v1/utils.py +0 -9
- synth_ai/zyk/__init__.py +0 -30
- synth_ai-0.2.9.dev0.dist-info/METADATA +0 -131
- synth_ai-0.2.9.dev0.dist-info/RECORD +0 -444
- {synth_ai/lm/caching → examples/task_apps}/__init__.py +0 -0
- {synth_ai/lm/cost → examples/task_apps/crafter}/__init__.py +0 -0
- {synth_ai/lm/structured_outputs → examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server}/__init__.py +0 -0
- {synth_ai/lm/vendors → examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests}/__init__.py +0 -0
- {synth_ai/lm/vendors/core → examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils}/__init__.py +0 -0
- {synth_ai/lm/vendors/local → examples/task_apps/math}/__init__.py +0 -0
- {synth_ai/lm/vendors/supported → examples/workflows}/__init__.py +0 -0
- {synth_ai/v0/tracing → examples/workflows/math_rl}/__init__.py +0 -0
- /synth_ai/{compound/cais.py → cli/__main__.py} +0 -0
- /synth_ai/{learning/filtering.py → py.typed} +0 -0
- {synth_ai-0.2.9.dev0.dist-info → synth_ai-0.2.23.dev3.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.9.dev0.dist-info → synth_ai-0.2.23.dev3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Test runner for Pokemon Emerald emulator tests
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
python tests/run_tests.py # Run all tests
|
|
7
|
+
python tests/run_tests.py test_fps_adjustment # Run specific test
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import sys
|
|
12
|
+
import subprocess
|
|
13
|
+
import importlib.util
|
|
14
|
+
|
|
15
|
+
def run_test(test_name):
    """Run a single test script from ``tests/`` in a child Python interpreter.

    The script path is relative to the current working directory, so this
    must be called from the project root.

    Args:
        test_name: Name of the test module, with or without a trailing
            ``.py`` (``"test_fps_adjustment"`` and
            ``"test_fps_adjustment.py"`` select the same file).

    Returns:
        True if the script exists and exits with status 0; False if the file
        is missing, exits non-zero, or could not be launched.
    """
    # Accept "name.py" as well as "name": without this a file name typed with
    # its extension is resolved to the non-existent "tests/name.py.py".
    if test_name.endswith(".py"):
        test_name = test_name[:-3]
    test_file = f"tests/{test_name}.py"

    if not os.path.exists(test_file):
        print(f"❌ Test file not found: {test_file}")
        return False

    print(f"🧪 Running test: {test_name}")
    print("=" * 50)

    try:
        # Output is deliberately not captured: the child inherits this
        # process's stdout/stderr so its output is shown as it is produced.
        result = subprocess.run([sys.executable, test_file])
    except Exception as e:
        # Best-effort boundary: a failure to launch the child is reported as
        # a failed test instead of aborting the whole run.
        print(f"❌ Error running test {test_name}: {e}")
        return False

    if result.returncode == 0:
        print(f"✅ Test {test_name} passed!")
        return True

    print(f"❌ Test {test_name} failed!")
    return False
|
|
42
|
+
|
|
43
|
+
def run_all_tests():
    """Discover and run every ``tests/test_*.py`` script, then print a summary.

    Scripts are run one at a time through ``run_test``, in sorted name order.
    The ``tests`` directory is looked up relative to the current working
    directory.

    Returns:
        True if at least one test script was found and all of them passed;
        False if none were found or any of them failed.
    """
    print("🧪 Pokemon Emerald Emulator Test Suite")
    print("=" * 50)

    # os.listdir() returns entries in arbitrary order; sorting keeps the run
    # order and the summary reproducible between runs and machines.
    test_files = sorted(
        entry[:-3]  # drop the ".py" extension
        for entry in os.listdir("tests")
        if entry.startswith("test_") and entry.endswith(".py")
    )

    if not test_files:
        print("❌ No test files found in tests/ directory")
        return False

    print(f"Found {len(test_files)} test(s):")
    for test in test_files:
        print(f" - {test}")
    print()
    print("💡 Note: For pytest-style tests, run:")
    print(" python -m pytest tests/test_fps_adjustment_pytest.py -v")
    print(" python -m pytest tests/test_server_map_validation.py -v")
    print("💡 Note: Test state files are located in tests/states/")
    print("💡 Note: Map reference files are saved in tests/map_references/")
    print()

    # Run each test, keeping (name, passed) pairs for the summary
    results = []
    for test in test_files:
        success = run_test(test)
        results.append((test, success))
        print()  # Add spacing between tests

    print("📋 Test Summary")
    print("=" * 30)

    passed_count = sum(1 for _, success in results if success)
    total_count = len(results)

    for test, success in results:
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"{test}: {status}")

    print(f"\nResults: {passed_count}/{total_count} tests passed")

    if passed_count == total_count:
        print("🎉 All tests passed!")
        return True

    print("⚠️ Some tests failed.")
    return False
|
|
96
|
+
|
|
97
|
+
def main():
    """Command-line entry point: check the working directory, then run tests.

    Exits with status 1 if ``server/app.py`` or ``tests`` is not found
    relative to the current directory. Otherwise runs the test named by the
    first command-line argument, or every test when no argument is given,
    and exits with 0 on success and 1 on failure.
    """
    # Each required path is paired with the lines printed when it is missing.
    preconditions = (
        (
            "server/app.py",
            (
                "❌ Error: This test runner must be run from the project root directory",
                "Please run: python tests/run_tests.py",
            ),
        ),
        (
            "tests",
            ("❌ Error: tests/ directory not found",),
        ),
    )
    for required_path, messages in preconditions:
        if os.path.exists(required_path):
            continue
        for message in messages:
            print(message)
        sys.exit(1)

    cli_args = sys.argv[1:]
    if cli_args:
        # Only the first argument is used as the test name
        passed = run_test(cli_args[0])
    else:
        passed = run_all_tests()

    sys.exit(0 if passed else 1)
|
|
120
|
+
|
|
121
|
+
if __name__ == "__main__":
|
|
122
|
+
main()
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Pytest for agent functionality (legacy test file)
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pytest
|
|
7
|
+
import requests
|
|
8
|
+
import json
|
|
9
|
+
import time
|
|
10
|
+
|
|
11
|
+
class TestAgentDirectAPI:
    """Checks of the ``agent`` module's globals and of its HTTP endpoints.

    The endpoint tests are marked as skipped because they require a running
    server at ``base_url``.
    """

    base_url = "http://localhost:8080"

    def _request_json(self, send, endpoint):
        """Call ``send`` on ``base_url/endpoint``, require HTTP 200, return the JSON body.

        ``send`` is a ``requests``-style callable taking a URL and ``timeout``.
        The calling test is skipped if the connection fails.
        """
        try:
            response = send(f"{self.base_url}/{endpoint}", timeout=2)
            assert response.status_code == 200
            return response.json()
        except requests.exceptions.ConnectionError:
            pytest.skip("agent server not running")

    def test_imports_work(self):
        """The ``agent`` module imports and exposes the expected names."""
        import agent

        for attribute in ('app', 'agent_mode', 'websocket_connections'):
            assert hasattr(agent, attribute)

    def test_global_state_initialized(self):
        """Module-level state has its expected initial values."""
        import agent

        connections = agent.websocket_connections
        assert agent.agent_mode == True  # Should start in agent mode by default
        assert agent.agent_auto_enabled == False  # Should start with auto disabled
        assert isinstance(connections, set)
        assert len(connections) == 0  # Should start empty

    def test_broadcast_function_exists(self):
        """``broadcast_state_update`` is present and callable."""
        import agent

        assert hasattr(agent, 'broadcast_state_update')
        assert callable(agent.broadcast_state_update)

    @pytest.mark.skip(reason="Requires running server")
    def test_status_endpoint(self):
        """GET /status answers 200 with ``step`` and ``agent_initialized``."""
        data = self._request_json(requests.get, "status")
        for key in ("step", "agent_initialized"):
            assert key in data

    @pytest.mark.skip(reason="Requires running server")
    def test_toggle_mode_endpoint(self):
        """POST /toggle_mode answers 200 and reports the resulting mode."""
        data = self._request_json(requests.post, "toggle_mode")
        assert "mode" in data
        assert "agent_mode" in data
        assert data["mode"] in ["MANUAL", "AGENT"]

    @pytest.mark.skip(reason="Requires running server")
    def test_toggle_auto_endpoint(self):
        """POST /toggle_auto answers 200 and reports the auto status."""
        data = self._request_json(requests.post, "toggle_auto")
        assert "auto_enabled" in data
        assert "status" in data
        assert data["status"] in ["ENABLED", "DISABLED"]
|
|
74
|
+
|
|
75
|
+
if __name__ == "__main__":
|
|
76
|
+
pytest.main([__file__, "-v"])
|
examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Pytest for Agent Prompts Validation
|
|
4
|
+
|
|
5
|
+
Tests that validate the actual prompt outputs from agent modules:
|
|
6
|
+
- action.py: Action decision prompts
|
|
7
|
+
- memory.py: Memory context generation
|
|
8
|
+
- perception.py: Observation and scene analysis
|
|
9
|
+
- planning.py: Strategic planning prompts
|
|
10
|
+
|
|
11
|
+
This test validates that the agent modules generate proper prompts without "Unknown" values.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import pytest
|
|
15
|
+
import json
|
|
16
|
+
import sys
|
|
17
|
+
import os
|
|
18
|
+
import requests
|
|
19
|
+
import time
|
|
20
|
+
import subprocess
|
|
21
|
+
from typing import Dict, Any, List, Set
|
|
22
|
+
from unittest.mock import Mock, patch
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
# Add the project root to the path
|
|
26
|
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
27
|
+
|
|
28
|
+
# Import agent modules
|
|
29
|
+
from agent.action import action_step
|
|
30
|
+
from agent.memory import memory_step, extract_key_state_info
|
|
31
|
+
from agent.perception import perception_step
|
|
32
|
+
from agent.planning import planning_step
|
|
33
|
+
from utils.vlm import VLM
|
|
34
|
+
from utils.state_formatter import format_state_for_llm, format_state_summary
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class TestAgentPrompts:
|
|
38
|
+
"""Test class for agent prompts validation"""
|
|
39
|
+
|
|
40
|
+
@pytest.fixture
|
|
41
|
+
def server_url(self):
    """Provide the base URL of the server that the tests query."""
    return "http://localhost:8000"
|
|
44
|
+
|
|
45
|
+
@pytest.fixture
|
|
46
|
+
def mock_vlm(self):
    """Build a ``Mock(spec=VLM)`` that records prompts and returns canned replies.

    Each call to ``get_query`` or ``get_text_query`` appends a dict with the
    keys ``'context'``, ``'prompt'`` and ``'frame'`` to ``captured_prompts``
    on the returned mock. ``'frame'`` is True only when ``get_query`` received
    a non-None frame. The reply is chosen by the first keyword found in
    ``context`` (PERCEPTION, ACTION, PLANNING, MEMORY), otherwise
    ``"Default response"``. ``get_text_query`` has no PERCEPTION reply.

    Returns:
        The configured mock, with ``captured_prompts`` already set to an
        empty list.
    """
    vlm = Mock(spec=VLM)

    # Created eagerly so a test can inspect it even when the module under
    # test never issued a query; it then fails on its own assertion message
    # instead of an AttributeError on the mock.
    vlm.captured_prompts = []

    # Checked in order; the first keyword contained in the context wins.
    canned_replies = (
        ("PERCEPTION", "I can see the player character on a grassy route with trees and paths."),
        ("ACTION", "UP"),
        ("PLANNING", "Continue exploring the route and look for items or trainers."),
        ("MEMORY", "Updated memory context with current observations."),
    )

    def reply_for(context, skip=()):
        """Return the canned reply for ``context``, ignoring keywords in ``skip``."""
        for keyword, reply in canned_replies:
            if keyword not in skip and keyword in context:
                return reply
        return "Default response"

    def mock_get_query(frame, prompt, context=""):
        # Capture the prompt for analysis
        vlm.captured_prompts.append({
            'context': context,
            'prompt': prompt,
            'frame': frame is not None,
        })
        return reply_for(context)

    def mock_get_text_query(prompt, context=""):
        # Capture the prompt for analysis; text queries never carry a frame
        vlm.captured_prompts.append({
            'context': context,
            'prompt': prompt,
            'frame': False,
        })
        return reply_for(context, skip=("PERCEPTION",))

    vlm.get_query = mock_get_query
    vlm.get_text_query = mock_get_text_query

    return vlm
|
|
96
|
+
|
|
97
|
+
def find_state_files(self):
    """Return the ``*.state`` files in the ``states`` directory next to this file.

    The paths are returned sorted. An empty list is returned when the
    directory does not exist.
    """
    states_dir = Path(__file__).parent / "states"
    return sorted(states_dir.glob("*.state")) if states_dir.exists() else []
|
|
105
|
+
|
|
106
|
+
def start_server_with_state(self, state_file_path: str):
    """Start ``server.app`` with the given state file and return its process.

    Any process whose command line matches ``server.app`` is first killed
    with ``pkill``. The new server is launched through ``conda run -n mgba``.
    This method sleeps for a fixed 3 seconds before returning; it does not
    check that the server is actually accepting requests.

    Args:
        state_file_path: Path passed to the server's ``--load-state`` option.

    Returns:
        The ``subprocess.Popen`` handle of the launched command.
    """
    # Kill any existing server processes
    try:
        subprocess.run(["pkill", "-f", "server.app"], check=False)
        time.sleep(1)
    except OSError:
        # pkill could not be launched (e.g. not installed); continue with the
        # start-up. Only OSError is ignored so Ctrl-C still interrupts the run.
        pass

    # NOTE(review): stdout/stderr are piped but nothing in this method reads
    # them; confirm whether callers consume them or whether a chatty server
    # could block on a full pipe.
    server_process = subprocess.Popen(
        [
            "conda", "run", "-n", "mgba", "python", "-m", "server.app",
            "--load-state", state_file_path,
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # Wait for server to start
    time.sleep(3)

    return server_process
|
|
127
|
+
|
|
128
|
+
def stop_server(self, server_process):
    """Stop a server process, escalating from terminate to kill.

    Args:
        server_process: A ``subprocess.Popen``-style handle, or a falsy value,
            in which case nothing is done.
    """
    if not server_process:
        return

    try:
        server_process.terminate()
        server_process.wait(timeout=5)
    except (subprocess.TimeoutExpired, OSError):
        # The process did not exit within 5 seconds (or could not be
        # signalled): force it down, then reap it so no zombie is left.
        server_process.kill()
        server_process.wait()
|
|
136
|
+
|
|
137
|
+
def get_state_from_server(self, server_url: str) -> Dict[str, Any]:
    """Fetch ``{server_url}/state`` and return the decoded JSON body.

    Args:
        server_url: Base URL of the server, without a trailing ``/state``.

    Returns:
        The parsed JSON on an HTTP 200 response; an empty dict on any other
        status code or if the request or decoding raises.
    """
    state: Dict[str, Any] = {}
    try:
        response = requests.get(f"{server_url}/state", timeout=10)
        if response.status_code == 200:
            state = response.json()
    except Exception:
        # Best effort: callers treat an empty dict as "no state available"
        return {}
    return state
|
|
147
|
+
|
|
148
|
+
def test_action_module_prompts(self, mock_vlm, server_url):
    """Test that action module generates proper prompts without 'Unknown' values.

    For every state file returned by ``find_state_files``, a server is
    started with that state, its state is fetched, and ``action_step`` is run
    against the mock VLM. The first ACTION prompt captured for that state
    file must not contain "Unknown", must contain the required section
    headers, and ``action_step`` must return a non-empty list. The test is
    skipped as soon as one state file yields no state data.
    """
    state_files = self.find_state_files()

    for state_file in state_files:
        print(f"\nTesting ACTION module with {state_file}")

        # Start server
        server_process = self.start_server_with_state(str(state_file))

        try:
            # Get state data
            state_data = self.get_state_from_server(server_url)
            if not state_data:
                pytest.skip(f"Could not get state data for {state_file}")

            # The same mock is reused for every state file in this loop, so
            # clear the capture log first. Otherwise action_prompts[0] below
            # would always be the prompt produced for the first state file.
            mock_vlm.captured_prompts = []

            # Mock inputs for action_step
            memory_context = "Test memory context"
            current_plan = "Test plan"
            latest_observation = "Test observation"
            frame = None
            recent_actions = ["UP", "A", "RIGHT"]

            # Call action_step
            actions = action_step(
                memory_context,
                current_plan,
                latest_observation,
                frame,
                state_data,
                recent_actions,
                mock_vlm,
            )

            # Check captured prompts
            action_prompts = [p for p in mock_vlm.captured_prompts if "ACTION" in p['context']]

            assert action_prompts, f"No action prompts captured for {state_file}"

            # Analyze the action prompt for issues
            action_prompt = action_prompts[0]['prompt']

            # Check for "Unknown" values in the prompt
            assert "Unknown" not in action_prompt, f"Action prompt contains 'Unknown' values in {state_file}"

            # Check for required sections
            required_sections = ["COMPREHENSIVE GAME STATE DATA", "ENHANCED ACTION CONTEXT", "ACTION DECISION TASK"]
            for section in required_sections:
                assert section in action_prompt, f"Action prompt missing section '{section}' in {state_file}"

            # Check that actions were returned
            assert actions, f"Action module returned no actions for {state_file}"
            assert isinstance(actions, list), f"Action module returned non-list actions for {state_file}"

        finally:
            self.stop_server(server_process)
|
|
196
|
+
|
|
197
|
+
def test_memory_module_prompts(self, mock_vlm, server_url):
    """Exercise the memory module against every saved state file.

    For each state file a server is started with that state and the state
    is fetched from ``server_url``. The test then checks that:

    * ``extract_key_state_info`` yields no 'Unknown' text and exposes all
      of the expected fields;
    * ``memory_step`` yields a context without 'Unknown', containing every
      expected section heading, and longer than 100 stripped characters.

    The test is skipped as soon as state data cannot be fetched, and the
    server started for a state file is always stopped.
    """
    expected_fields = ('state_summary', 'player_name', 'money', 'current_map', 'in_battle', 'party_health')
    expected_sections = ("COMPREHENSIVE MEMORY CONTEXT", "CURRENT STATE", "CURRENT PLAN", "KEY EVENTS", "RECENT MEMORY")

    for state_file in self.find_state_files():
        print(f"\nTesting MEMORY module with {state_file}")

        # One server per state file; torn down in the ``finally`` below.
        server_process = self.start_server_with_state(str(state_file))

        try:
            snapshot = self.get_state_from_server(server_url)
            if not snapshot:
                pytest.skip(f"Could not get state data for {state_file}")

            # --- extract_key_state_info ---------------------------------
            summary = extract_key_state_info(snapshot)
            assert "Unknown" not in str(summary), f"Memory key_info contains 'Unknown' values in {state_file}"
            for field in expected_fields:
                assert field in summary, f"Memory key_info missing field '{field}' in {state_file}"

            # --- memory_step --------------------------------------------
            # Minimal single-frame buffer wrapping the fetched state.
            buffered_frames = [
                {
                    "frame_id": 1,
                    "observation": "Test observation",
                    "state": snapshot,
                },
            ]
            updated_memory = memory_step(
                "Test memory context",
                "Test plan",
                ["UP", "A", "RIGHT"],
                buffered_frames,
                mock_vlm,
            )

            assert "Unknown" not in updated_memory, f"Memory context contains 'Unknown' values in {state_file}"
            for section in expected_sections:
                assert section in updated_memory, f"Memory context missing section '{section}' in {state_file}"

            # Guard against a near-empty context slipping through.
            assert len(updated_memory.strip()) > 100, f"Memory context seems too short in {state_file}"

        finally:
            self.stop_server(server_process)
|
|
252
|
+
|
|
253
|
+
def test_perception_module_prompts(self, mock_vlm, server_url):
    """Check the prompt built by the perception module for every state file.

    For each state file a server is started with that state, the state is
    fetched from ``server_url`` and ``perception_step`` is run against
    ``mock_vlm`` with no frame. The prompt captured *during that call*
    must not contain 'Unknown', must contain the required section
    headings, and must mention at least three of the analysis keywords.
    The returned observation must be a non-empty dict and
    ``slow_thinking`` must be a bool.

    The test is skipped as soon as state data cannot be fetched, and the
    server started for a state file is always stopped.
    """
    state_files = self.find_state_files()

    for state_file in state_files:
        print(f"\nTesting PERCEPTION module with {state_file}")

        # Start server
        server_process = self.start_server_with_state(str(state_file))

        try:
            # Get state data
            state_data = self.get_state_from_server(server_url)
            if not state_data:
                pytest.skip(f"Could not get state data for {state_file}")

            # Mock frame
            frame = None

            # The mock is shared by every iteration of this loop, so remember
            # how many prompts it already holds. Without this, index 0 would
            # always be the prompt produced for the first state file.
            # NOTE(review): assumes captured_prompts only grows while the
            # fixture is alive - confirm against the mock's implementation.
            already_captured = len(mock_vlm.captured_prompts)

            # Call perception_step
            observation, slow_thinking = perception_step(frame, state_data, mock_vlm)

            # Only consider prompts captured for this state file
            new_prompts = list(mock_vlm.captured_prompts)[already_captured:]
            perception_prompts = [p for p in new_prompts if "PERCEPTION" in p['context']]

            assert perception_prompts, f"No perception prompts captured for {state_file}"

            # Analyze the perception prompt for issues
            perception_prompt = perception_prompts[0]['prompt']

            # Check for "Unknown" values in the prompt
            assert "Unknown" not in perception_prompt, f"Perception prompt contains 'Unknown' values in {state_file}"

            # Check for required sections
            required_sections = ["COMPREHENSIVE GAME STATE DATA", "VISUAL ANALYSIS TASK"]
            for section in required_sections:
                assert section in perception_prompt, f"Perception prompt missing section '{section}' in {state_file}"

            # Check for analysis instructions (any three of the five suffice)
            analysis_keywords = ["CUTSCENE", "MAP", "BATTLE", "DIALOGUE", "MENU"]
            found_keywords = [kw for kw in analysis_keywords if kw in perception_prompt]
            assert len(found_keywords) >= 3, f"Perception prompt missing analysis keywords in {state_file}. Found: {found_keywords}"

            # Check that observation was returned
            assert observation, f"Perception module returned no observation for {state_file}"
            assert isinstance(observation, dict), f"Perception module returned non-dict observation for {state_file}"

            # Check that slow_thinking is boolean
            assert isinstance(slow_thinking, bool), f"Perception module returned non-boolean slow_thinking for {state_file}"

        finally:
            self.stop_server(server_process)
|
|
305
|
+
|
|
306
|
+
def test_planning_module_prompts(self, mock_vlm, server_url):
    """Check the prompt built by the planning module for every state file.

    For each state file a server is started with that state, the state is
    fetched from ``server_url`` and ``planning_step`` is run against
    ``mock_vlm`` with no existing plan and slow thinking requested. The
    prompt captured *during that call* must not contain 'Unknown', must
    contain the required section headings, and must mention at least
    three of the planning keywords. The returned plan must be a non-empty
    string.

    The test is skipped as soon as state data cannot be fetched, and the
    server started for a state file is always stopped.
    """
    state_files = self.find_state_files()

    for state_file in state_files:
        print(f"\nTesting PLANNING module with {state_file}")

        # Start server
        server_process = self.start_server_with_state(str(state_file))

        try:
            # Get state data
            state_data = self.get_state_from_server(server_url)
            if not state_data:
                pytest.skip(f"Could not get state data for {state_file}")

            # Mock inputs for planning_step
            memory_context = "Test memory context"
            current_plan = None  # Start with no plan
            slow_thinking_needed = True

            # The mock is shared by every iteration of this loop, so remember
            # how many prompts it already holds. Without this, index 0 would
            # always be the prompt produced for the first state file.
            # NOTE(review): assumes captured_prompts only grows while the
            # fixture is alive - confirm against the mock's implementation.
            already_captured = len(mock_vlm.captured_prompts)

            # Call planning_step
            plan = planning_step(memory_context, current_plan, slow_thinking_needed, state_data, mock_vlm)

            # Only consider prompts captured for this state file
            new_prompts = list(mock_vlm.captured_prompts)[already_captured:]
            planning_prompts = [p for p in new_prompts if "PLANNING" in p['context']]

            assert planning_prompts, f"No planning prompts captured for {state_file}"

            # Analyze the planning prompt for issues
            planning_prompt = planning_prompts[0]['prompt']

            # Check for "Unknown" values in the prompt
            assert "Unknown" not in planning_prompt, f"Planning prompt contains 'Unknown' values in {state_file}"

            # Check for required sections
            required_sections = ["COMPREHENSIVE GAME STATE DATA", "STRATEGIC PLANNING TASK"]
            for section in required_sections:
                assert section in planning_prompt, f"Planning prompt missing section '{section}' in {state_file}"

            # Check for planning instructions (any three of the four suffice)
            planning_keywords = ["IMMEDIATE GOAL", "SHORT-TERM OBJECTIVES", "LONG-TERM STRATEGY", "EFFICIENCY NOTES"]
            found_keywords = [kw for kw in planning_keywords if kw in planning_prompt]
            assert len(found_keywords) >= 3, f"Planning prompt missing planning keywords in {state_file}. Found: {found_keywords}"

            # Check that plan was returned
            assert plan, f"Planning module returned no plan for {state_file}"
            assert isinstance(plan, str), f"Planning module returned non-string plan for {state_file}"

        finally:
            self.stop_server(server_process)
|
|
357
|
+
|
|
358
|
+
def test_all_modules_integration(self, mock_vlm, server_url):
    """Run perception, memory, planning and action in sequence per state file.

    For each state file a server is started with that state and the state
    is fetched from ``server_url``. The four module steps are then chained:
    the memory output feeds planning and action, the plan feeds action,
    and the perception observation feeds action. Each step's result is
    checked, and finally every prompt held by ``mock_vlm`` is checked for
    the text 'Unknown'.

    The test is skipped as soon as state data cannot be fetched, and the
    server started for a state file is always stopped.
    """
    for state_file in self.find_state_files():
        print(f"\nTesting ALL MODULES integration with {state_file}")

        # One server per state file; torn down in the ``finally`` below.
        server_process = self.start_server_with_state(str(state_file))

        try:
            snapshot = self.get_state_from_server(server_url)
            if not snapshot:
                pytest.skip(f"Could not get state data for {state_file}")

            # Shared placeholder inputs for the pipeline.
            initial_memory = "Test memory context"
            no_plan = None
            no_frame = None
            action_history = ["UP", "A", "RIGHT"]
            buffered_frames = [
                {
                    "frame_id": 1,
                    "observation": "Test observation",
                    "state": snapshot,
                },
            ]

            # Step 1: perception
            observation, slow_thinking = perception_step(no_frame, snapshot, mock_vlm)
            assert observation and isinstance(observation, dict)

            # Step 2: memory
            updated_memory = memory_step(initial_memory, no_plan, action_history, buffered_frames, mock_vlm)
            assert "Unknown" not in updated_memory

            # Step 3: planning, driven by the refreshed memory
            plan = planning_step(updated_memory, no_plan, slow_thinking, snapshot, mock_vlm)
            assert plan and isinstance(plan, str)

            # Step 4: action, driven by the memory, plan and observation
            actions = action_step(updated_memory, plan, observation, no_frame, snapshot, action_history, mock_vlm)
            assert actions and isinstance(actions, list)

            # No prompt held by the mock may mention "Unknown".
            for captured in mock_vlm.captured_prompts:
                assert "Unknown" not in captured['prompt'], f"Found 'Unknown' in {captured['context']} prompt for {state_file}"

        finally:
            self.stop_server(server_process)
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
if __name__ == "__main__":
    # Allow running this module directly: hand only this file to pytest.
    pytest_args = [__file__]
    pytest.main(pytest_args)
|