synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +190 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
- examples/sft/evaluate.py +2 -0
- examples/sft/generate_traces.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +56 -26
- examples/swe/task_app/hosted/rollout.py +42 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/__init__.py +5 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +4 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +4 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +4 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
- examples/task_apps/pokemon_red/task_app.py +799 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +4 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +4 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +4 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +4 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/warming_up_to_rl/groq_test.py +2 -0
- examples/warming_up_to_rl/run_local_rollout.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
- examples/warming_up_to_rl/run_rollout_remote.py +2 -0
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +2 -2
- synth_ai/api/models/supported.py +1 -0
- synth_ai/api/train/builders.py +25 -11
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +10 -10
- synth_ai/api/train/configs/rl.py +5 -4
- synth_ai/api/train/configs/sft.py +4 -3
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +48 -59
- synth_ai/cli/_modal_wrapper.py +3 -2
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +14 -7
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/recent.py +1 -1
- synth_ai/cli/rl_demo.py +8 -7
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/status.py +1 -1
- synth_ai/cli/task_apps.py +1922 -190
- synth_ai/cli/traces.py +1 -1
- synth_ai/cli/tui.py +57 -0
- synth_ai/cli/turso.py +1 -1
- synth_ai/cli/watch.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +104 -12
- synth_ai/evals/client.py +58 -61
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +9 -9
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +24 -5
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +257 -0
- synth_ai/task/contracts.py +138 -39
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +56 -0
- synth_ai/task/rubrics/loaders.py +152 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +116 -0
- synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
- synth_ai/task/server.py +8 -7
- synth_ai/task/trace_correlation_helpers.py +315 -0
- synth_ai/task/validators.py +413 -6
- synth_ai/tracing_v3/abstractions.py +3 -3
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +5 -5
- synth_ai/tracing_v3/session_tracer.py +16 -6
- synth_ai/tracing_v3/storage/base.py +29 -29
- synth_ai/tracing_v3/storage/config.py +3 -3
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/daemon.py +8 -7
- synth_ai/tracing_v3/turso/native_manager.py +66 -43
- synth_ai/tracing_v3/utils.py +3 -3
- synth_ai/tui/__init__.py +5 -0
- synth_ai/tui/__main__.py +13 -0
- synth_ai/tui/cli/__init__.py +1 -0
- synth_ai/tui/cli/query_experiments.py +164 -0
- synth_ai/tui/cli/query_experiments_v3.py +164 -0
- synth_ai/tui/dashboard.py +906 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
- examples/agora_ex/README_MoE.md +0 -224
- examples/agora_ex/__init__.py +0 -7
- examples/agora_ex/agora_ex.py +0 -65
- examples/agora_ex/agora_ex_task_app.py +0 -590
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
- examples/agora_ex/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/system_prompt_CURRENT.md +0 -63
- examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
- examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
- synth_ai/rubrics/__init__.py +0 -22
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Compatibility wrapper for the GRPO Enron task app.
|
|
2
|
+
|
|
3
|
+
This mirrors the structure of the Crafter task app wrapper while delegating
|
|
4
|
+
all configuration to the colocated `grpo_enron.py` module. Normal usage should
|
|
5
|
+
prefer invoking `uvx synth-ai serve grpo-enron`, but this module remains for
|
|
6
|
+
direct execution or importing the FastAPI app object.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from fastapi.exceptions import RequestValidationError
|
|
15
|
+
from fastapi.responses import JSONResponse
|
|
16
|
+
from starlette.requests import Request
|
|
17
|
+
from synth_ai.task.apps import ModalDeploymentConfig, registry
|
|
18
|
+
from synth_ai.task.auth import is_api_key_header_authorized, normalize_environment_api_key
|
|
19
|
+
from synth_ai.task.server import TaskAppConfig, create_task_app, run_task_app
|
|
20
|
+
|
|
21
|
+
from .grpo_enron import build_config
|
|
22
|
+
|
|
23
|
+
# Identifier under which this task app is looked up in the task-app registry
# (see the `registry.get(APP_ID)` call below).
APP_ID = "grpo-enron"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _build_base_config() -> TaskAppConfig:
    """Build the Enron task app configuration by delegating to ``build_config``.

    Construction is deferred to call time so that importing this module does
    not trigger heavy work.
    """
    base_config = build_config()
    return base_config
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Start from the "registry unavailable" defaults and overwrite them only when
# the registry lookup succeeds.
MODAL_DEPLOYMENT: ModalDeploymentConfig | None = None
ENV_FILES: tuple[str, ...] = ()
try:
    _REGISTERED_ENTRY = registry.get(APP_ID)
except Exception:  # pragma: no cover - registry unavailable in some contexts
    pass
else:
    MODAL_DEPLOYMENT = _REGISTERED_ENTRY.modal
    ENV_FILES = tuple(_REGISTERED_ENTRY.env_files)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def build_task_app_config() -> TaskAppConfig:
    """Return a fresh TaskAppConfig for this wrapper.

    A new base config is built on every call and its ``clone()`` is returned.
    """
    return _build_base_config().clone()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def fastapi_app():
    """Return the FastAPI application for Modal or other ASGI hosts.

    The app is created from ``build_task_app_config()`` and then customised:

    * The default ``GET /health`` and ``GET /health/rollout`` routes are
      replaced with versions that answer 200 even when the caller's API key
      is missing or wrong ("soft" auth failure), and 503 when the server
      itself has no ``ENVIRONMENT_API_KEY`` configured.
    * Request validation failures (422) are logged with a small snapshot of
      the request before a JSON error body is returned.

    Returns:
        The configured FastAPI application object.
    """
    # fastapi is already a dependency of this module; the import is local so
    # this function stays self-contained.
    from fastapi.encoders import jsonable_encoder

    # Upper bound on how many leading characters of the secret key are ever
    # logged or returned to an unauthenticated caller. Echoing half of the key
    # (the previous behaviour) makes the remainder far easier to guess.
    max_disclosed_key_chars = 4

    app = create_task_app(build_task_app_config())

    # Replace default health endpoints so we can permit soft auth failures and log 422s.
    health_paths = {"/health", "/health/rollout"}
    app.router.routes = [
        route
        for route in app.router.routes
        if not (
            getattr(route, "path", None) in health_paths
            and "GET" in (getattr(route, "methods", set()) or set())
        )
    ]

    def _log_env_key_prefix(source: str, env_key: str | None) -> str | None:
        """Log and return a short prefix of the expected key, or None if unset."""
        if not env_key:
            return None
        visible = max(1, min(max_disclosed_key_chars, len(env_key) // 2))
        prefix = env_key[:visible]
        print(f"[{source}] expected ENVIRONMENT_API_KEY prefix: {prefix}")
        return prefix

    def _health_response(source: str, request: Request, authorized_body: dict):
        """Shared body of both health endpoints; only the success payload differs."""
        env_key = normalize_environment_api_key()
        if not env_key:
            return JSONResponse(
                status_code=503,
                content={"status": "unhealthy", "detail": "Missing ENVIRONMENT_API_KEY"},
            )
        if not is_api_key_header_authorized(request):
            # Soft failure: report healthy but unauthorized, with a hint.
            prefix = _log_env_key_prefix(source, env_key)
            content = {"status": "healthy", "authorized": False}
            if prefix:
                content["expected_api_key_prefix"] = prefix
            return JSONResponse(status_code=200, content=content)
        return authorized_body

    @app.get("/health")
    async def health(request: Request):
        return _health_response("health", request, {"status": "healthy", "authorized": True})

    @app.get("/health/rollout")
    async def health_rollout(request: Request):
        return _health_response("health/rollout", request, {"ok": True, "authorized": True})

    @app.exception_handler(RequestValidationError)
    async def _on_validation_error(request: Request, exc: RequestValidationError):
        # Validation error dicts can carry values that are not JSON-serialisable
        # (for example exception objects under "ctx"); encode them first so the
        # 422 response cannot itself fail while being rendered.
        errors = jsonable_encoder(exc.errors()[:5])
        try:
            hdr = request.headers
            snapshot = {
                "path": str(request.url.path),
                "have_x_api_key": bool(hdr.get("x-api-key")),
                "have_x_api_keys": bool(hdr.get("x-api-keys")),
                "have_authorization": bool(hdr.get("authorization")),
                "errors": errors,
            }
            print("[422] validation", snapshot, flush=True)
        except Exception:
            # Logging is deliberately best-effort; never mask the 422 itself.
            pass
        return JSONResponse(
            status_code=422,
            content={"status": "invalid", "detail": errors},
        )

    return app
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
if __name__ == "__main__":
    # Command-line entry point: parse host/port/reload/env-file options and
    # hand them to run_task_app.
    cli = argparse.ArgumentParser(description="Run the Enron task app locally")
    cli.add_argument("--host", default="0.0.0.0")
    cli.add_argument("--port", type=int, default=8102)
    cli.add_argument("--reload", action="store_true", help="Enable uvicorn autoreload")
    cli.add_argument(
        "--env-file",
        action="append",
        default=[],
        help="Additional .env files to load before startup",
    )
    options = cli.parse_args()

    # The backend dev env file, four directories above this file's directory,
    # is loaded first when it exists; user-supplied files follow it.
    backend_env = Path(__file__).resolve().parents[4] / "backend" / ".env.dev"
    dotenv_paths = []
    if backend_env.exists():
        dotenv_paths.append(str(backend_env))
    dotenv_paths += options.env_file or []

    run_task_app(
        build_task_app_config,
        host=options.host,
        port=options.port,
        reload=options.reload,
        env_files=dotenv_paths,
    )
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Shared fixtures for Enron tests."""
|
|
2
|
+
import os
|
|
3
|
+
import socket
|
|
4
|
+
import subprocess
|
|
5
|
+
from subprocess import TimeoutExpired
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Iterator
|
|
9
|
+
|
|
10
|
+
import pytest
|
|
11
|
+
|
|
12
|
+
requests = pytest.importorskip("requests")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _which(executable: str) -> bool:
|
|
16
|
+
return any(
|
|
17
|
+
(Path(path) / executable).exists()
|
|
18
|
+
for path in os.getenv("PATH", "").split(os.pathsep)
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _find_free_port() -> int:
|
|
23
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
|
24
|
+
sock.bind(("127.0.0.1", 0))
|
|
25
|
+
return sock.getsockname()[1]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _wait_for_server(base_url: str, timeout: float = 60.0) -> None:
    """Wait for the Enron server to become ready.

    Polls ``GET {base_url}/info`` until it answers with 200, or with 400/401
    (the server is up but wants authentication, which is good enough).

    Args:
        base_url: Root URL of the task app, without a trailing slash.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If no acceptable response arrives before the deadline.
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # Try /info first (no auth required if --insecure)
            resp = requests.get(f"{base_url}/info", timeout=2.0)
        except requests.RequestException:
            pass
        else:
            if resp.status_code in (200, 400, 401):
                return
        # Pause after every unsuccessful attempt, including unexpected status
        # codes, so a half-started server is not hammered in a tight loop.
        time.sleep(0.5)
    raise RuntimeError(f"Task app at {base_url} did not become ready")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@pytest.fixture(scope="module")
def enron_server(tmp_path_factory: pytest.TempPathFactory) -> Iterator[str]:
    """Start the Enron task app server for testing.

    Launches ``uv run -m synth_ai task-app serve grpo-enron`` on a free local
    port, waits until it responds, and yields its base URL. The test is
    skipped when ``uv`` or ``GROQ_API_KEY`` is unavailable, when the process
    exits immediately, or when it never becomes ready.

    Server output is written to a log file in the temporary directory rather
    than to a pipe: nothing reads a pipe while the tests run, so a chatty
    server could fill the pipe buffer and block.

    Yields:
        The base URL (``http://127.0.0.1:<port>``) of the running server.
    """
    if not _which("uv"):
        pytest.skip("uv executable not found on PATH")
    if "GROQ_API_KEY" not in os.environ:
        pytest.skip("GROQ_API_KEY must be set for Groq-backed tests")

    port = _find_free_port()
    base_url = f"http://127.0.0.1:{port}"
    tmp_path = tmp_path_factory.mktemp("enron")
    (tmp_path / "traces").mkdir(parents=True, exist_ok=True)
    log_path = tmp_path / "task_app.log"

    def _log_tail(max_lines: int = 20) -> str:
        """Return the last *max_lines* lines of the server log ('' if unreadable)."""
        try:
            text = log_path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            return ""
        return "\n".join(text.strip().splitlines()[-max_lines:])

    def _stop(process: subprocess.Popen, grace: float) -> None:
        """Terminate *process*, escalating to kill, and always reap it."""
        if process.poll() is None:
            process.terminate()
            try:
                process.wait(timeout=grace)
            except TimeoutExpired:
                process.kill()
                process.wait()
        if process.stdin:
            try:
                process.stdin.close()
            except OSError:
                pass

    cmd = [
        "uv",
        "run",
        "-m",
        "synth_ai",
        "task-app",
        "serve",
        "grpo-enron",
        "--port",
        str(port),
        "--no-reload",
    ]
    with log_path.open("w", encoding="utf-8") as log_file:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            env=os.environ.copy(),
            stdin=subprocess.PIPE,
        )
        try:
            # Send "n" to decline tracing
            try:
                if proc.stdin:
                    proc.stdin.write("n\n")
                    proc.stdin.flush()
            except (OSError, ValueError):
                # The process may already have exited; handled by poll() below.
                pass

            time.sleep(2)
            if proc.poll() is not None:
                pytest.skip(f"Task app terminated immediately:\n{_log_tail()}")

            try:
                _wait_for_server(base_url)
            except RuntimeError as e:
                # Stop the server first so its final output is in the log.
                _stop(proc, 10)
                pytest.skip(f"Task app failed to start: {e}\n{_log_tail()}")

            yield base_url
        finally:
            _stop(proc, 5)
|
|
115
|
+
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Integration tests for Enron task app with Groq evaluation."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import socket
|
|
6
|
+
import subprocess
|
|
7
|
+
from subprocess import TimeoutExpired
|
|
8
|
+
import time
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Iterator
|
|
11
|
+
|
|
12
|
+
import pytest
|
|
13
|
+
|
|
14
|
+
requests = pytest.importorskip("requests")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Directory containing this test module.
HERE = Path(__file__).resolve().parents[0]
# Two levels above HERE.
TASK_APP_ROOT = Path(__file__).resolve().parents[2]
# Eval configuration file expected directly under TASK_APP_ROOT.
CONFIG_PATH = TASK_APP_ROOT.joinpath("eval_groq_qwen32.toml")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _which(executable: str) -> bool:
|
|
23
|
+
return any(
|
|
24
|
+
(Path(path) / executable).exists()
|
|
25
|
+
for path in os.getenv("PATH", "").split(os.pathsep)
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _find_free_port() -> int:
|
|
30
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
|
31
|
+
sock.bind(("127.0.0.1", 0))
|
|
32
|
+
return sock.getsockname()[1]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _wait_for_server(base_url: str, timeout: float = 60.0) -> None:
    """Wait for the Enron server to become ready.

    Polls ``GET {base_url}/info`` until it answers with HTTP 200.

    Args:
        base_url: Root URL of the task app, without a trailing slash.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If no 200 response arrives before the deadline.
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            resp = requests.get(f"{base_url}/info", timeout=2.0)
        except requests.RequestException:
            pass
        else:
            if resp.status_code == 200:
                return
        # Pause after every unsuccessful attempt, including non-200 replies,
        # so a half-started server is not hammered in a tight loop.
        time.sleep(0.5)
    raise RuntimeError(f"Task app at {base_url} did not become ready")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _log_tail(log_path: Path, max_lines: int = 20) -> str:
    """Return the last *max_lines* lines of the server log ("" if unreadable)."""
    try:
        text = log_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""
    return "\n".join(text.strip().splitlines()[-max_lines:])


def _stop_process(proc: "subprocess.Popen[str]", timeout: float = 5.0) -> None:
    """Terminate *proc*, escalate to kill on timeout, and always reap it."""
    if proc.poll() is None:
        proc.terminate()
        try:
            proc.wait(timeout=timeout)
        except TimeoutExpired:
            proc.kill()
            # Reap the killed child so it does not linger as a zombie.
            proc.wait()
    if proc.stdin and not proc.stdin.closed:
        try:
            proc.stdin.close()
        except OSError:
            # The child is gone; a broken pipe on close is expected.
            pass


@pytest.fixture
def enron_server(tmp_path: Path) -> Iterator[str]:
    """Start the Enron task app server for testing.

    Launches ``uv run -m synth_ai task-app serve grpo-enron`` on a free
    loopback port, waits until it answers, and yields its base URL. The
    server's combined stdout/stderr is written to a log file under
    *tmp_path* rather than to a pipe: nothing reads the output while the
    tests run, so a pipe could fill up and block the server.

    The test is skipped when ``uv`` is not on PATH, when ``GROQ_API_KEY`` is
    unset, when the process exits right after launch, or when the server
    does not become ready in time. The process is always stopped and reaped
    on teardown.

    Args:
        tmp_path: pytest-provided temporary directory for this test.

    Yields:
        Base URL of the running server, e.g. ``http://127.0.0.1:<port>``.
    """
    if not _which("uv"):
        pytest.skip("uv executable not found on PATH")
    if "GROQ_API_KEY" not in os.environ:
        pytest.skip("GROQ_API_KEY must be set for Groq-backed evals")

    port = _find_free_port()
    base_url = f"http://127.0.0.1:{port}"
    # NOTE(review): this directory is created but never handed to the server.
    trace_dir = tmp_path / "traces"
    trace_dir.mkdir(parents=True, exist_ok=True)
    log_path = tmp_path / "task_app.log"

    env = os.environ.copy()
    cmd = [
        "uv",
        "run",
        "-m",
        "synth_ai",
        "task-app",
        "serve",
        "grpo-enron",
        "--port",
        str(port),
        "--no-reload",
    ]
    # The child keeps its own duplicate of the log descriptor, so the parent's
    # handle can be closed as soon as the process has been spawned.
    with log_path.open("w", encoding="utf-8") as log_file:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            env=env,
            stdin=subprocess.PIPE,
        )

    try:
        # Send "n" to decline tracing
        try:
            if proc.stdin:
                proc.stdin.write("n\n")
                proc.stdin.flush()
        except OSError:
            # The process may already have exited; the poll below reports it.
            pass

        time.sleep(2)
        if proc.poll() is not None:
            pytest.skip(f"Task app terminated immediately:\n{_log_tail(log_path)}")

        try:
            _wait_for_server(base_url)
        except RuntimeError as e:
            # Stop the server first so the log is complete when it is read.
            _stop_process(proc, timeout=10)
            pytest.skip(f"Task app failed to start: {e}\n{_log_tail(log_path)}")

        yield base_url
    finally:
        _stop_process(proc)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@pytest.mark.slow
def test_enron_server_health(enron_server: str) -> None:
    """Request ``/health`` from the running server and accept a 200 or 400 reply."""
    health_url = f"{enron_server}/health"
    status = requests.get(health_url, timeout=5.0).status_code
    assert status in (200, 400), f"Unexpected status: {status}"
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@pytest.mark.slow
def test_enron_task_info(enron_server: str) -> None:
    """Check that ``/task_info`` answers 200 and reports the Enron task id."""
    response = requests.get(f"{enron_server}/task_info", timeout=5.0)
    assert response.status_code == 200

    payload = response.json()
    assert "task" in payload
    task = payload["task"]
    assert task["id"] == "enron_email_qa"
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@pytest.mark.slow
def test_enron_eval_with_groq(enron_server: str) -> None:
    """Run ``synth_ai eval grpo-enron`` against the fixture server for one seed.

    Skips when the eval config file is missing. Fails when the eval command
    exits non-zero; otherwise checks its combined output for the completion,
    success and score markers.
    """
    if not CONFIG_PATH.exists():
        pytest.skip(f"Config file not found: {CONFIG_PATH}")

    eval_cmd = ["uv", "run", "-m", "synth_ai", "eval", "grpo-enron"]
    eval_cmd += ["--config", str(CONFIG_PATH)]
    eval_cmd += ["--url", enron_server]
    eval_cmd += ["--model", "qwen/qwen3-32b"]
    eval_cmd += ["--seeds", "0"]

    completed = subprocess.run(
        eval_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        env=os.environ.copy(),
        check=False,
        timeout=300,
    )
    output = completed.stdout

    if completed.returncode != 0:
        pytest.fail(f"Eval failed with return code {completed.returncode}:\n{output}")

    # Completion and per-seed success markers.
    assert "Eval complete" in output
    assert "1 ok, 0 failed" in output or "status=200" in output

    # A score line must be present (case-insensitive match).
    lowered = output.lower()
    assert "official" in lowered or "mean_return" in lowered
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Integration test for Enron rollouts via /rollout endpoint."""
|
|
2
|
+
import os
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
requests = pytest.importorskip("requests")
|
|
6
|
+
|
|
7
|
+
# Bearer header for the /rollout endpoint, built from ENVIRONMENT_API_KEY at
# import time. The key must never be hard-coded here: this file is published,
# and any key that was previously committed should be rotated.
# NOTE(review): the variable has to be exported into the process environment
# (e.g. by loading .env) before the tests run — confirm how CI provides it.
AUTH_HEADER = {"Authorization": f"Bearer {os.environ.get('ENVIRONMENT_API_KEY', '')}"}
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@pytest.mark.slow
def test_enron_manual_rollout(enron_server: str) -> None:
    """Post a scripted search-then-answer rollout and check the response shape.

    The request carries two explicit ops and a ``noop`` policy provider. The
    response must be a 200 containing trajectories, metrics and a trace with
    a ``session_trace`` entry, and the first trajectory must have steps.
    """
    search_op = {
        "tool": "search_emails",
        "args": {
            "inbox": "test@enron.com",
            "keywords": ["test", "question"],
            "max_results": 5,
        },
    }
    answer_op = {
        "tool": "answer_question",
        "args": {"answer": "This is a test answer"},
    }
    request_body = {
        "run_id": "test_manual_enron",
        "env": {"seed": 0},
        "ops": [search_op, answer_op],
        "policy": {
            "policy_name": "manual",
            "config": {"provider": "noop"},
        },
    }

    response = requests.post(
        f"{enron_server}/rollout",
        json=request_body,
        headers=AUTH_HEADER,
        timeout=60.0,
    )
    assert response.status_code == 200, f"Rollout failed: {response.status_code} {response.text}"
    body = response.json()

    # Top-level structure of the rollout response.
    assert "trajectories" in body
    assert len(body["trajectories"]) > 0
    assert "metrics" in body
    assert "trace" in body

    # The trace must be populated, not just present.
    trace = body["trace"]
    assert trace is not None
    assert "session_trace" in trace

    first_trajectory = body["trajectories"][0]
    assert "steps" in first_trajectory

    # At least one step is expected (the original notes: the initial observation).
    assert len(first_trajectory["steps"]) > 0
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@pytest.mark.slow
def test_enron_policy_rollout(enron_server: str) -> None:
    """Test an Enron rollout using Groq policy.

    Sends a rollout request with no explicit ops so that the configured Groq
    policy drives the episode, then checks the response structure and that
    any reported episode return is numeric. Skipped when ``GROQ_API_KEY`` is
    not set.
    """
    if "GROQ_API_KEY" not in os.environ:
        pytest.skip("GROQ_API_KEY required for this test")

    rollout_payload = {
        "run_id": "test_policy_enron",
        "env": {"seed": 0},
        "ops": [],  # Empty ops means use policy
        "policy": {
            "policy_name": "qwen-groq",
            "config": {
                "provider": "groq",
                "model": "qwen/qwen3-32b",
                "temperature": 0.2,
                "max_tokens": 1024,
            },
        },
    }

    resp = requests.post(
        f"{enron_server}/rollout",
        json=rollout_payload,
        headers=AUTH_HEADER,
        timeout=180.0,  # Enron can be slow with multiple tool calls
    )

    assert resp.status_code == 200, f"Rollout failed: {resp.status_code} {resp.text}"
    data = resp.json()

    # Verify response structure
    assert "trajectories" in data
    # Guard the index below so an empty list is reported as an assertion
    # failure (as in the manual rollout test) rather than an IndexError.
    assert len(data["trajectories"]) > 0, "Rollout returned no trajectories"
    assert "metrics" in data
    assert "trace" in data

    trajectory = data["trajectories"][0]
    assert "steps" in trajectory

    # Check that steps were taken
    assert len(trajectory["steps"]) > 0

    # Verify metrics
    metrics = data["metrics"]
    assert "episode_returns" in metrics or "mean_return" in metrics

    # Check that we got some reward (could be negative for search penalty)
    if "episode_returns" in metrics and len(metrics["episode_returns"]) > 0:
        # Just verify it's a number
        assert isinstance(metrics["episode_returns"][0], (int, float))
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@pytest.mark.fast
def test_enron_rollout_with_auth(enron_server: str) -> None:
    """Post to ``/rollout`` with no auth header and expect a 400, 401 or 403."""
    # NOTE(review): marked "fast" although it depends on the enron_server
    # fixture like the "slow" tests — confirm the marker is intended.
    unauthenticated_body = {
        "run_id": "test_auth",
        "env": {"seed": 0},
        "ops": [],
        "policy": {"config": {"provider": "noop"}},
    }

    # Deliberately no ``headers=`` argument: the request carries no credentials.
    response = requests.post(
        f"{enron_server}/rollout",
        json=unauthenticated_body,
        timeout=10.0,
    )

    rejected_statuses = (400, 401, 403)
    assert response.status_code in rejected_statuses, f"Expected auth error, got {response.status_code}"
|
|
135
|
+
|