synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (229) hide show
  1. examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
  2. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +186 -0
  3. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
  4. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
  5. examples/multi_step/crafter_rl_lora.md +51 -10
  6. examples/multi_step/sse_metrics_streaming_notes.md +357 -0
  7. examples/multi_step/task_app_config_notes.md +7 -1
  8. examples/swe/task_app/grpo_swe_mini.py +55 -26
  9. examples/swe/task_app/hosted/rollout.py +40 -0
  10. examples/swe/task_app/hosted/test_service.py +5 -6
  11. examples/task_apps/TESTING.md +275 -0
  12. examples/task_apps/__init__.py +0 -0
  13. examples/task_apps/crafter/__init__.py +0 -0
  14. examples/task_apps/crafter/task_app/__init__.py +2 -0
  15. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +21 -46
  16. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  17. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
  18. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
  19. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +67 -49
  20. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +242 -193
  21. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  22. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  23. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  24. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  25. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  26. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  27. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  28. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  29. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  30. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  31. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  32. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  33. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  34. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  35. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  36. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  37. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  38. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  39. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  40. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  41. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  42. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  43. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  44. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  45. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  71. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  72. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  73. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  74. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  75. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  76. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  77. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  78. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  79. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  80. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  81. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  82. examples/task_apps/enron/__init__.py +1 -0
  83. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  84. examples/task_apps/enron/task_app/README.md +14 -0
  85. examples/task_apps/enron/task_app/__init__.py +1 -0
  86. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  87. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  88. examples/task_apps/enron/tests/__init__.py +2 -0
  89. examples/task_apps/enron/tests/conftest.py +115 -0
  90. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  91. examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
  92. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  93. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  94. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  95. examples/task_apps/math/__init__.py +0 -0
  96. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  97. examples/task_apps/pokemon_battle/__init__.py +2 -0
  98. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  99. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  100. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  101. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  102. examples/task_apps/pokemon_red/README.md +357 -0
  103. examples/task_apps/pokemon_red/__init__.py +3 -0
  104. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  105. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
  106. examples/task_apps/pokemon_red/task_app.py +606 -0
  107. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
  108. examples/task_apps/sokoban/README.md +307 -0
  109. examples/task_apps/sokoban/__init__.py +3 -0
  110. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  111. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  112. examples/task_apps/sokoban/task_app.py +1058 -0
  113. examples/task_apps/sokoban/tests/__init__.py +2 -0
  114. examples/task_apps/sokoban/tests/conftest.py +113 -0
  115. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  116. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  117. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  118. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  119. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  120. examples/task_apps/verilog/__init__.py +1 -0
  121. examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
  122. examples/task_apps/verilog/task_app/README.md +12 -0
  123. examples/task_apps/verilog/task_app/__init__.py +1 -0
  124. examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
  125. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  126. examples/task_apps/verilog/tests/__init__.py +2 -0
  127. examples/task_apps/verilog/tests/conftest.py +115 -0
  128. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  129. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
  130. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  131. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  132. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  133. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  134. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  135. examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +4 -2
  136. examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +4 -2
  137. examples/warming_up_to_rl/run_eval.py +127 -18
  138. examples/workflows/__init__.py +0 -0
  139. examples/workflows/math_rl/__init__.py +0 -0
  140. examples/workflows/math_rl/download_dataset.py +80 -0
  141. synth_ai/__init__.py +41 -1
  142. synth_ai/api/train/builders.py +73 -29
  143. synth_ai/api/train/cli.py +12 -6
  144. synth_ai/api/train/configs/__init__.py +44 -0
  145. synth_ai/api/train/configs/rl.py +134 -0
  146. synth_ai/api/train/configs/sft.py +95 -0
  147. synth_ai/api/train/configs/shared.py +24 -0
  148. synth_ai/api/train/env_resolver.py +5 -2
  149. synth_ai/api/train/supported_algos.py +10 -5
  150. synth_ai/api/train/utils.py +7 -4
  151. synth_ai/cli/__init__.py +7 -51
  152. synth_ai/cli/_storage.py +4 -3
  153. synth_ai/cli/_validate_task_app.py +11 -0
  154. synth_ai/cli/balance.py +4 -3
  155. synth_ai/cli/calc.py +2 -2
  156. synth_ai/cli/demo.py +49 -43
  157. synth_ai/cli/legacy_root_backup.py +1 -1
  158. synth_ai/cli/rl_demo.py +86 -106
  159. synth_ai/cli/root.py +0 -97
  160. synth_ai/cli/task_apps.py +1710 -186
  161. synth_ai/demos/core/cli.py +121 -159
  162. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
  163. synth_ai/environments/examples/crafter_classic/environment.py +16 -0
  164. synth_ai/environments/examples/enron/engine.py +7 -2
  165. synth_ai/environments/examples/enron/environment.py +68 -0
  166. synth_ai/environments/examples/red/engine.py +27 -0
  167. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  168. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  169. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  170. synth_ai/environments/examples/red/environment.py +60 -0
  171. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  172. synth_ai/environments/examples/verilog/engine.py +30 -4
  173. synth_ai/evals/__init__.py +15 -0
  174. synth_ai/evals/client.py +82 -0
  175. synth_ai/evals/types.py +42 -0
  176. synth_ai/jobs/client.py +16 -4
  177. synth_ai/judge_schemas.py +127 -0
  178. synth_ai/py.typed +0 -0
  179. synth_ai/task/__init__.py +14 -5
  180. synth_ai/task/contracts.py +124 -38
  181. synth_ai/task/proxy.py +48 -56
  182. synth_ai/task/rubrics/__init__.py +53 -0
  183. synth_ai/task/rubrics/loaders.py +133 -0
  184. synth_ai/task/rubrics/models.py +57 -0
  185. synth_ai/task/rubrics/scoring.py +113 -0
  186. synth_ai/task/rubrics/strict.py +149 -0
  187. synth_ai/task/server.py +8 -7
  188. synth_ai/task/validators.py +269 -6
  189. synth_ai/tracing_v3/decorators.py +7 -3
  190. synth_ai/tracing_v3/replica_sync.py +4 -4
  191. synth_ai/tracing_v3/serialization.py +130 -0
  192. synth_ai/tracing_v3/trace_utils.py +317 -0
  193. synth_ai/tracing_v3/turso/native_manager.py +3 -3
  194. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
  195. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +228 -89
  196. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -1
  197. synth_ai/task/rubrics.py +0 -219
  198. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  199. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  200. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  201. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  202. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  203. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  204. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  205. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  206. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
  207. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
  208. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  209. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  210. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  211. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  212. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  213. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  214. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  215. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  216. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  217. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
  218. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  219. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  220. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  221. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  222. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  223. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  224. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  225. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  226. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  227. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
  228. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
  229. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,275 @@
1
+ # Task App Testing Guide
2
+
3
+ This document describes how to run tests for the task apps in this directory.
4
+
5
+ ## Overview
6
+
7
+ Each task app has unit and integration tests following a consistent pattern inspired by the customer environment tests in `customers/`.
8
+
9
+ ## Test Structure
10
+
11
+ ```
12
+ examples/task_apps/<app_name>/tests/
13
+ ├── __init__.py
14
+ ├── integration/
15
+ │ ├── __init__.py
16
+ │ └── test_<app>_eval.py # Server startup + eval tests
17
+ └── unit/
18
+ ├── __init__.py
19
+ └── test_<app>_*.py # Environment, scoring, dataset tests
20
+ ```
21
+
22
+ ## Running Tests
23
+
24
+ ### Prerequisites
25
+
26
+ ```bash
27
+ # Install test dependencies
28
+ uv sync --dev
29
+
30
+ # Set required environment variables
31
+ export GROQ_API_KEY="your-groq-key"
32
+ export OPENAI_API_KEY="your-openai-key" # For Sokoban
33
+ ```
34
+
35
+ ### Run All Tests for a Task App
36
+
37
+ ```bash
38
+ # Verilog
39
+ pytest examples/task_apps/verilog/tests/ -v
40
+
41
+ # Enron
42
+ pytest examples/task_apps/enron/tests/ -v
43
+
44
+ # Sokoban
45
+ pytest examples/task_apps/sokoban/tests/ -v
46
+ ```
47
+
48
+ ### Run Only Unit Tests (Fast)
49
+
50
+ ```bash
51
+ # Runs quickly, no server startup required
52
+ pytest examples/task_apps/verilog/tests/unit/ -v
53
+ pytest examples/task_apps/enron/tests/unit/ -v
54
+ pytest examples/task_apps/sokoban/tests/unit/ -v
55
+ ```
56
+
57
+ ### Run Only Integration Tests
58
+
59
+ ```bash
60
+ # Slower, starts servers and runs evals
61
+ pytest examples/task_apps/verilog/tests/integration/ -v
62
+ pytest examples/task_apps/enron/tests/integration/ -v
63
+ pytest examples/task_apps/sokoban/tests/integration/ -v
64
+ ```
65
+
66
+ ### Run All Task App Tests
67
+
68
+ ```bash
69
+ # Run everything
70
+ pytest examples/task_apps/*/tests/ -v
71
+
72
+ # Skip slow tests
73
+ pytest examples/task_apps/*/tests/ -v -m "not slow"
74
+ ```
75
+
76
+ ## Test Categories
77
+
78
+ ### Unit Tests
79
+
80
+ **Purpose**: Test individual components in isolation
81
+ - Environment initialization
82
+ - Reward calculation
83
+ - Tool implementations
84
+ - State management
85
+
86
+ **Characteristics**:
87
+ - Fast (< 1 second each)
88
+ - No external dependencies
89
+ - No server startup
90
+ - No API calls
91
+
92
+ **Examples**:
93
+ - `test_verilog_scoring.py`: Tests reward components (compile, simulate, submit)
94
+ - `test_enron_environment.py`: Tests search, answer, reward calculation
95
+ - `test_sokoban_environment.py`: Tests actions, rewards, truncation
96
+
97
+ ### Integration Tests
98
+
99
+ **Purpose**: Test the full system end-to-end
100
+ - Server startup
101
+ - Health/info endpoints
102
+ - Full evaluation runs
103
+ - **Rollout execution** (manual and policy-driven)
104
+
105
+ **Characteristics**:
106
+ - Slower (30-300 seconds)
107
+ - Requires server startup
108
+ - May require API keys
109
+ - Tests real workflows
110
+
111
+ **Examples**:
112
+ - `test_verilog_eval.py`: Starts server, runs Groq eval with Qwen3-32B
113
+ - `test_verilog_rollout.py`: **Manual & policy rollouts via /rollout endpoint**
114
+ - `test_enron_eval.py`: Starts server, runs Groq eval
115
+ - `test_enron_rollout.py`: **Manual & policy rollouts, auth testing**
116
+ - `test_sokoban_eval.py`: Starts server, tests manual rollout
117
+ - `test_sokoban_rollout.py`: **6 rollout tests (manual, policy, difficulties, limits)**
118
+
119
+ ## What Each Test Validates
120
+
121
+ ### Verilog Tests
122
+
123
+ **Unit Tests** (4 tests):
124
+ - ✅ Compile success gives +0.1 reward
125
+ - ✅ Simulation pass gives +1.0 reward
126
+ - ✅ Submit success gives +10.0 reward
127
+ - ✅ Submit checks last simulation output correctly
128
+
129
+ **Integration Tests** (5 tests):
130
+ - ✅ Server starts and responds to /health
131
+ - ✅ /task_info returns valid Verilog task metadata
132
+ - ✅ Full eval with Qwen3-32B completes successfully
133
+ - ✅ **Manual rollout** with explicit write/compile/simulate/submit
134
+ - ✅ **Policy rollout** using Groq/Qwen3-32B (verifies LLM integration)
135
+
136
+ ### Enron Tests
137
+
138
+ **Unit Tests** (3 tests):
139
+ - ✅ search_emails tool works correctly
140
+ - ✅ answer_question tool calculates rewards
141
+ - ✅ Exact answer match gives high reward (>0.9)
142
+ - ✅ Partial answer match gives medium reward (>0.5)
143
+ - ✅ Wrong answer gives low reward (<0.5)
144
+
145
+ **Integration Tests** (6 tests):
146
+ - ✅ Server starts and responds to /health
147
+ - ✅ /task_info returns valid Enron task metadata
148
+ - ✅ Full eval with Qwen3-32B completes successfully
149
+ - ✅ **Manual rollout** with explicit search/read/answer actions
150
+ - ✅ **Policy rollout** using Groq/Qwen3-32B
151
+ - ✅ **Authentication** enforcement (rejects requests without auth header)
152
+
153
+ ### Sokoban Tests
154
+
155
+ **Unit Tests** (3 tests):
156
+ - ✅ Module imports work correctly
157
+ - ✅ Reward components exist (goal achieved, step penalty)
158
+ - ✅ Engine creation with different difficulty levels
159
+
160
+ **Integration Tests** (9 tests):
161
+ - ✅ Server starts and responds to /health
162
+ - ✅ /task_info returns valid Sokoban task metadata
163
+ - ✅ **Manual rollout** with movement actions (left/right/up/down)
164
+ - ✅ **Policy rollout** with OpenAI GPT-5-mini (may skip if slow)
165
+ - ✅ **All difficulty levels** (easy/medium/hard) work correctly
166
+ - ✅ **Max steps limit** enforcement (stops at configured limit)
167
+ - ✅ **Puzzle completion detection** (terminated=True when solved)
168
+ - ✅ Truncation on max_steps
169
+ - ✅ Response structure validation
170
+
171
+ ## Debugging Test Failures
172
+
173
+ ### Server Won't Start
174
+
175
+ ```bash
176
+ # Check if port is already in use
177
+ lsof -i :<port>
178
+
179
+ # Start the server manually and watch its startup logs
180
+ uv run -m synth_ai task-app serve <app_name> --port 8999
181
+
182
+ # Check environment variables
183
+ echo $GROQ_API_KEY
184
+ echo $OPENAI_API_KEY
185
+ ```
186
+
187
+ ### Tests Timeout
188
+
189
+ ```bash
190
+ # Run with more verbose output
191
+ pytest <test_file> -v -s
192
+
193
+ # Fail any test that runs longer than 60 seconds (requires the pytest-timeout plugin)
194
+ pytest <test_file> -v --timeout=60
195
+ ```
196
+
197
+ ### Import Errors
198
+
199
+ ```bash
200
+ # Ensure you're in the right directory
201
+ cd /path/to/synth-ai
202
+
203
+ # Reinstall dependencies
204
+ uv sync --dev
205
+ ```
206
+
207
+ ## CI/CD Integration
208
+
209
+ These tests can be run in CI with:
210
+
211
+ ```yaml
212
+ # .github/workflows/test-task-apps.yml
213
+ - name: Run task app tests
214
+ env:
215
+ GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
216
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
217
+ run: |
218
+ # Unit tests (fast, always run)
219
+ pytest examples/task_apps/*/tests/unit/ -v
220
+
221
+ # Integration tests (slower, only on main)
222
+ if [ "$GITHUB_REF" = "refs/heads/main" ]; then
223
+ pytest examples/task_apps/*/tests/integration/ -v --timeout=300
224
+ fi
225
+ ```
226
+
227
+ ## Adding Tests for New Task Apps
228
+
229
+ When creating a new task app, follow this pattern:
230
+
231
+ 1. **Create test structure**:
232
+ ```bash
233
+ mkdir -p examples/task_apps/<new_app>/tests/{unit,integration}
234
+ touch examples/task_apps/<new_app>/tests/__init__.py
235
+ touch examples/task_apps/<new_app>/tests/unit/__init__.py
236
+ touch examples/task_apps/<new_app>/tests/integration/__init__.py
237
+ ```
238
+
239
+ 2. **Create unit tests** (`tests/unit/test_<app>_*.py`):
240
+ - Test environment initialization
241
+ - Test reward calculation
242
+ - Test tool implementations
243
+ - Test edge cases
244
+
245
+ 3. **Create integration tests** (`tests/integration/test_<app>_eval.py`):
246
+ - Copy from an existing integration test
247
+ - Update app name, port, config path
248
+ - Add app-specific endpoint tests
249
+
250
+ 4. **Add to CI**:
251
+ - Update CI config to include new tests
252
+ - Ensure required env vars are set
253
+
254
+ ## Test Coverage Goals
255
+
256
+ - Unit test coverage: >80%
257
+ - Integration test coverage: 100% of critical paths
258
+ - All public APIs have at least one integration test
259
+ - All reward components have unit tests
260
+
261
+ ## Common Issues
262
+
263
+ ### "Task app terminated immediately"
264
+ - Check that the app name is correct
265
+ - Verify the app is registered in `synth_ai/task/apps.py`
266
+ - Check recent changes to the app code
267
+
268
+ ### "GROQ_API_KEY must be set"
269
+ - Set the environment variable
270
+ - Or skip the test: `pytest -k "not groq"`
271
+
272
+ ### "Config file not found"
273
+ - Ensure eval config exists in task app directory
274
+ - Check the path in the test matches actual location
275
+
File without changes
File without changes
@@ -0,0 +1,2 @@
1
+ """Crafter task app implementation."""
2
+
@@ -68,7 +68,7 @@ def _resolve_repo_root() -> Path:
68
68
  def _resolve_task_app_root(repo_root: Path) -> Path:
69
69
  """Locate the task_app directory even when the module is copied to a temp mount."""
70
70
 
71
- preferred = (repo_root / "examples" / "warming_up_to_rl" / "task_app").resolve()
71
+ preferred = (repo_root / "examples" / "task_apps" / "crafter" / "task_app").resolve()
72
72
  if preferred.is_dir():
73
73
  return preferred
74
74
 
@@ -81,7 +81,7 @@ def _resolve_task_app_root(repo_root: Path) -> Path:
81
81
  if (candidate / "synth_envs_hosted").is_dir():
82
82
  return candidate
83
83
 
84
- fallback = Path("/opt/synth_ai_repo/examples/warming_up_to_rl/task_app")
84
+ fallback = Path("/opt/synth_ai_repo/examples/task_apps/crafter/task_app")
85
85
  if fallback.is_dir():
86
86
  return fallback.resolve()
87
87
 
@@ -93,6 +93,7 @@ TASK_APP_ROOT = _resolve_task_app_root(REPO_ROOT)
93
93
  SYNTH_ENVS_HOSTED_ROOT = (TASK_APP_ROOT / "synth_envs_hosted").resolve()
94
94
 
95
95
  EXAMPLES_ROOT = (REPO_ROOT / "examples").resolve()
96
+ RUBRICS_ROOT = (EXAMPLES_ROOT / "multi_step" / "rubrics").resolve()
96
97
 
97
98
  for path in (REPO_ROOT, TASK_APP_ROOT, SYNTH_ENVS_HOSTED_ROOT, EXAMPLES_ROOT):
98
99
  try:
@@ -305,13 +306,16 @@ def build_dataset() -> tuple[TaskDatasetRegistry, CrafterDataset]:
305
306
  def _base_task_info(dataset: CrafterDataset) -> TaskInfo:
306
307
  return TaskInfo(
307
308
  task={"id": "crafter_classic", "name": "Crafter Classic", "version": "1.0.0"},
308
- environments=["crafter"],
309
+ environment="crafter",
309
310
  action_space={
310
311
  "type": "discrete",
312
+ "description": f"Discrete action space with {len(crafter_constants.actions)} actions including movement, crafting, and interaction",
311
313
  "size": len(crafter_constants.actions),
312
314
  "actions": list(crafter_constants.actions),
313
315
  },
314
316
  observation={
317
+ "type": "dict",
318
+ "description": "RGB frame (64x64x3) plus inventory counts, achievements, and semantic map patches",
315
319
  "summary": "RGB frame plus inventory, achievements, and semantic map patches.",
316
320
  "keys": ["image", "inventory", "achievements", "semantic_map_patch7"],
317
321
  "image_shape": [64, 64, 3],
@@ -335,49 +339,13 @@ def _base_task_info(dataset: CrafterDataset) -> TaskInfo:
335
339
  },
336
340
  "tool": {"name": "interact", "parallel_tool_calls": False},
337
341
  },
338
- capabilities={
339
- "supports_rollout": True,
340
- "supports_env_lifecycle": True,
341
- "requires_api_key_header": True,
342
- },
343
342
  limits={"max_ops": 100000, "max_time_s": 3600},
344
343
  )
345
344
 
346
345
 
347
- OUTCOME_RUBRIC = load_rubric(
348
- {
349
- "version": "1",
350
- "goal_text": "Reward unlocking Crafter achievements and survival.",
351
- "aggregation": "weighted_sum",
352
- "criteria": [
353
- {
354
- "id": "achievements",
355
- "description": "Unlock achievements or crafting milestones.",
356
- "weight": 1.0,
357
- },
358
- {
359
- "id": "survival",
360
- "description": "Maintain health, food, and drink levels.",
361
- "weight": 1.0,
362
- },
363
- ],
364
- }
365
- )
346
+ OUTCOME_RUBRIC = load_rubric(str(RUBRICS_ROOT / "crafter_outcome_rubric.json"))
366
347
 
367
- EVENTS_RUBRIC = load_rubric(
368
- {
369
- "version": "1",
370
- "goal_text": "Encourage purposeful step-wise exploration and crafting.",
371
- "aggregation": "weighted_sum",
372
- "criteria": [
373
- {
374
- "id": "progress_steps",
375
- "description": "Actions progress quests, crafting, or exploration.",
376
- "weight": 1.0,
377
- }
378
- ],
379
- }
380
- )
348
+ EVENTS_RUBRIC = load_rubric(str(RUBRICS_ROOT / "crafter_events_rubric.json"))
381
349
 
382
350
 
383
351
  def describe_taskset(dataset: CrafterDataset) -> dict[str, Any]:
@@ -396,29 +364,36 @@ def provide_task_instances(
396
364
  dataset: CrafterDataset, base_info: TaskInfo, seeds: Sequence[int]
397
365
  ) -> Iterable[TaskInfo]:
398
366
  infos: list[TaskInfo] = []
367
+ base_observation = getattr(base_info, "observation", None)
368
+ if hasattr(base_observation, "model_dump"):
369
+ observation_template = base_observation.model_dump()
370
+ elif isinstance(base_observation, dict):
371
+ observation_template = dict(base_observation)
372
+ else:
373
+ observation_template = {}
374
+
399
375
  for seed_value in seeds:
400
376
  summary = dataset.describe_seed(seed_value)
401
377
  infos.append(
402
378
  TaskInfo(
403
379
  task=base_info.task,
404
- environments=base_info.environments,
380
+ environment=base_info.environment,
405
381
  action_space=base_info.action_space,
406
382
  observation={
407
- **base_info.observation,
383
+ **observation_template,
408
384
  "seed": seed_value,
409
385
  "traits": summary["traits"],
410
386
  "inventory": summary["inventory"],
411
387
  "player_position": summary["player_position"],
412
388
  },
413
389
  dataset={
414
- **base_info.dataset,
390
+ **base_info.dataset.model_dump(),
415
391
  "seed": seed_value,
416
392
  "difficulty": summary["difficulty"],
417
393
  "config": summary["config"],
418
394
  },
419
395
  rubric=base_info.rubric,
420
396
  inference=base_info.inference,
421
- capabilities=base_info.capabilities,
422
397
  limits=base_info.limits,
423
398
  )
424
399
  )
@@ -689,7 +664,7 @@ register_task_app(
689
664
  # Mount repo root so local modules resolve when deployed on Modal
690
665
  (str(REPO_ROOT), "/opt/synth_ai_repo"),
691
666
  (str(REPO_ROOT / "synth_ai"), "/opt/synth_ai_repo/synth_ai"),
692
- (str(TASK_APP_ROOT), "/opt/synth_ai_repo/examples/warming_up_to_rl/task_app"),
667
+ (str(TASK_APP_ROOT), "/opt/synth_ai_repo/examples/task_apps/crafter/task_app"),
693
668
  ),
694
669
  secret_names=("groq-api-key", "openai-api-key"),
695
670
  memory=16384,
@@ -1,7 +1,7 @@
1
1
  """Compatibility wrapper for the GRPO Crafter task app.
2
2
 
3
3
  This module now delegates to the TaskAppConfig defined in the colocated example at
4
- `examples/warming_up_to_rl/task_app/grpo_crafter.py`. It is kept for legacy usage
4
+ `examples/task_apps/crafter/task_app/grpo_crafter.py`. It is kept for legacy usage
5
5
  (running the file directly or targeting `fastapi_app` from external tooling). Prefer using
6
6
  `uvx synth-ai serve grpo-crafter` for local development and testing.
7
7
  """
@@ -44,6 +44,7 @@ class CrafterPolicy(Policy):
44
44
  self.inference_url = inference_url
45
45
  self.model = model
46
46
  self.use_tools = True
47
+ self.use_vision = False # Enable vision for VLMs
47
48
  # Sampling parameters (populated via initialize(config))
48
49
  self.temperature: float | None = None
49
50
  self.top_p: float | None = None
@@ -63,6 +64,11 @@ class CrafterPolicy(Policy):
63
64
  self.model = config["model"]
64
65
  if "use_tools" in config:
65
66
  self.use_tools = bool(config["use_tools"])
67
+ if "use_vision" in config:
68
+ self.use_vision = bool(config["use_vision"])
69
+ # Auto-detect vision capability from model name if not explicitly set
70
+ if "use_vision" not in config and self.model:
71
+ self.use_vision = self._is_vision_model(self.model)
66
72
  # Adopt sampling params from policy config (trainer passes these through)
67
73
  if "temperature" in config:
68
74
  self.temperature = float(config["temperature"]) # fail fast on bad types
@@ -384,6 +390,7 @@ class CrafterPolicy(Policy):
384
390
  "inference_url": self.inference_url,
385
391
  "model": self.model,
386
392
  "use_tools": self.use_tools,
393
+ "use_vision": self.use_vision,
387
394
  },
388
395
  "state": self.state_dict(),
389
396
  }
@@ -396,7 +403,8 @@ class CrafterPolicy(Policy):
396
403
  inference_url=config["inference_url"],
397
404
  model=config.get("model"),
398
405
  )
399
- policy.use_tools = bool(config["use_tools"])
406
+ policy.use_tools = bool(config.get("use_tools", True))
407
+ policy.use_vision = bool(config.get("use_vision", False))
400
408
  policy.load_state_dict(state)
401
409
  return policy
402
410
 
@@ -446,12 +454,60 @@ class CrafterPolicy(Policy):
446
454
 
447
455
  return format_observation(obs_data, step_count=step_idx, max_steps=max_steps)
448
456
 
457
+ @staticmethod
458
+ def _is_vision_model(model_name: str) -> bool:
459
+ """Check if a model supports vision/image inputs based on its name."""
460
+ if not model_name:
461
+ return False
462
+
463
+ model_lower = model_name.lower()
464
+
465
+ # Known vision-capable model patterns
466
+ vision_patterns = [
467
+ "gpt-4o", # GPT-4o series
468
+ "gpt-4-turbo", # GPT-4 Turbo with vision
469
+ "gpt-4-vision", # Explicit vision variant
470
+ "gpt-5", # GPT-5 series (all variants support vision)
471
+ "claude-3", # All Claude 3 models support vision
472
+ "gemini", # Gemini models
473
+ "qwen-vl", # Qwen Vision-Language models
474
+ "qwen2-vl", # Qwen2 VL
475
+ "pixtral", # Mistral's vision model
476
+ "llava", # LLaVA models
477
+ "phi-3-vision", # Microsoft Phi-3 Vision
478
+ "internvl", # InternVL models
479
+ "cogvlm", # CogVLM models
480
+ "vision", # Generic vision indicator
481
+ ]
482
+
483
+ return any(pattern in model_lower for pattern in vision_patterns)
484
+
449
485
  def _extract_image_parts(
450
486
  self, observation: dict[str, Any] | None
451
487
  ) -> list[dict[str, Any]]:
452
- """Crafter policy uses text-only prompts; do not attach image parts."""
453
-
454
- return []
488
+ """Extract image parts from crafter observation for vision-capable models.
489
+
490
+ Returns OpenAI-style image_url format if vision is enabled and image data is available.
491
+ """
492
+ # Only extract images if vision is enabled for this policy
493
+ if not self.use_vision:
494
+ return []
495
+
496
+ if not observation:
497
+ return []
498
+
499
+ # Get the observation data (could be nested)
500
+ obs = observation.get("observation", observation)
501
+ if not isinstance(obs, dict):
502
+ return []
503
+
504
+ # Extract the data URL (includes base64-encoded image)
505
+ data_url = obs.get("observation_image_data_url")
506
+ if not data_url or not isinstance(data_url, str):
507
+ return []
508
+
509
+ # Return OpenAI-style image_url format
510
+ return [{"type": "image_url", "image_url": {"url": data_url}}]
455
511
 
456
512
  def parse_model_response(
457
513
  self, response: str, observation: dict[str, Any]