deepgym 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. deepgym-0.1.0/.env.example +12 -0
  2. deepgym-0.1.0/.github/workflows/ci.yml +65 -0
  3. deepgym-0.1.0/.gitignore +47 -0
  4. deepgym-0.1.0/CHANGELOG.md +15 -0
  5. deepgym-0.1.0/CLAUDE.md +240 -0
  6. deepgym-0.1.0/CONTRIBUTING.md +33 -0
  7. deepgym-0.1.0/LICENSE +21 -0
  8. deepgym-0.1.0/PKG-INFO +326 -0
  9. deepgym-0.1.0/README.md +276 -0
  10. deepgym-0.1.0/data/.gitkeep +0 -0
  11. deepgym-0.1.0/docs/rl-primer.html +837 -0
  12. deepgym-0.1.0/examples/python_sorting/reference_solution.py +35 -0
  13. deepgym-0.1.0/examples/python_sorting/task.md +28 -0
  14. deepgym-0.1.0/examples/python_sorting/verifier.py +147 -0
  15. deepgym-0.1.0/examples/run_example.py +23 -0
  16. deepgym-0.1.0/examples/string_manipulation/reference_solution.py +25 -0
  17. deepgym-0.1.0/examples/string_manipulation/task.md +27 -0
  18. deepgym-0.1.0/examples/string_manipulation/verifier.py +186 -0
  19. deepgym-0.1.0/examples/two_sum/reference_solution.py +18 -0
  20. deepgym-0.1.0/examples/two_sum/task.md +27 -0
  21. deepgym-0.1.0/examples/two_sum/verifier.py +205 -0
  22. deepgym-0.1.0/huggingface/app.py +93 -0
  23. deepgym-0.1.0/huggingface/requirements.txt +2 -0
  24. deepgym-0.1.0/notebooks/quickstart.ipynb +427 -0
  25. deepgym-0.1.0/pyproject.toml +71 -0
  26. deepgym-0.1.0/scripts/demo_grpo_reward.py +164 -0
  27. deepgym-0.1.0/scripts/demo_trl_grpo.py +292 -0
  28. deepgym-0.1.0/scripts/import_bigcodebench.py +206 -0
  29. deepgym-0.1.0/scripts/import_evalplus.py +338 -0
  30. deepgym-0.1.0/scripts/import_humaneval.py +148 -0
  31. deepgym-0.1.0/scripts/import_mbpp.py +178 -0
  32. deepgym-0.1.0/scripts/publish.sh +11 -0
  33. deepgym-0.1.0/scripts/run_blackbox_scan.py +445 -0
  34. deepgym-0.1.0/scripts/run_exploit_scan.py +254 -0
  35. deepgym-0.1.0/scripts/run_rl_discovery.py +293 -0
  36. deepgym-0.1.0/scripts/upload_hf_datasets.py +107 -0
  37. deepgym-0.1.0/scripts/validate_bigcodebench.py +85 -0
  38. deepgym-0.1.0/src/deepgym/__init__.py +58 -0
  39. deepgym-0.1.0/src/deepgym/adversarial.py +1088 -0
  40. deepgym-0.1.0/src/deepgym/api/__init__.py +0 -0
  41. deepgym-0.1.0/src/deepgym/api/app.py +147 -0
  42. deepgym-0.1.0/src/deepgym/api/deps.py +25 -0
  43. deepgym-0.1.0/src/deepgym/api/routes.py +375 -0
  44. deepgym-0.1.0/src/deepgym/api/schemas.py +233 -0
  45. deepgym-0.1.0/src/deepgym/async_core.py +676 -0
  46. deepgym-0.1.0/src/deepgym/cli.py +359 -0
  47. deepgym-0.1.0/src/deepgym/computer_use.py +240 -0
  48. deepgym-0.1.0/src/deepgym/core.py +329 -0
  49. deepgym-0.1.0/src/deepgym/envs/__init__.py +1 -0
  50. deepgym-0.1.0/src/deepgym/envs/anagram_check/metadata.json +7 -0
  51. deepgym-0.1.0/src/deepgym/envs/anagram_check/reference_solution.py +2 -0
  52. deepgym-0.1.0/src/deepgym/envs/anagram_check/task.md +24 -0
  53. deepgym-0.1.0/src/deepgym/envs/anagram_check/verifier.py +139 -0
  54. deepgym-0.1.0/src/deepgym/envs/binary_search/metadata.json +7 -0
  55. deepgym-0.1.0/src/deepgym/envs/binary_search/reference_solution.py +11 -0
  56. deepgym-0.1.0/src/deepgym/envs/binary_search/task.md +24 -0
  57. deepgym-0.1.0/src/deepgym/envs/binary_search/verifier.py +138 -0
  58. deepgym-0.1.0/src/deepgym/envs/climbing_stairs/metadata.json +7 -0
  59. deepgym-0.1.0/src/deepgym/envs/climbing_stairs/reference_solution.py +7 -0
  60. deepgym-0.1.0/src/deepgym/envs/climbing_stairs/task.md +23 -0
  61. deepgym-0.1.0/src/deepgym/envs/climbing_stairs/verifier.py +140 -0
  62. deepgym-0.1.0/src/deepgym/envs/coin_change/metadata.json +7 -0
  63. deepgym-0.1.0/src/deepgym/envs/coin_change/reference_solution.py +8 -0
  64. deepgym-0.1.0/src/deepgym/envs/coin_change/task.md +24 -0
  65. deepgym-0.1.0/src/deepgym/envs/coin_change/verifier.py +150 -0
  66. deepgym-0.1.0/src/deepgym/envs/computer_use/cli_task/metadata.json +8 -0
  67. deepgym-0.1.0/src/deepgym/envs/computer_use/cli_task/reference_solution.py +49 -0
  68. deepgym-0.1.0/src/deepgym/envs/computer_use/cli_task/task.md +14 -0
  69. deepgym-0.1.0/src/deepgym/envs/computer_use/cli_task/verifier.py +140 -0
  70. deepgym-0.1.0/src/deepgym/envs/computer_use/file_organizer/metadata.json +8 -0
  71. deepgym-0.1.0/src/deepgym/envs/computer_use/file_organizer/reference_solution.py +26 -0
  72. deepgym-0.1.0/src/deepgym/envs/computer_use/file_organizer/task.md +19 -0
  73. deepgym-0.1.0/src/deepgym/envs/computer_use/file_organizer/verifier.py +122 -0
  74. deepgym-0.1.0/src/deepgym/envs/fizzbuzz/metadata.json +7 -0
  75. deepgym-0.1.0/src/deepgym/envs/fizzbuzz/reference_solution.py +9 -0
  76. deepgym-0.1.0/src/deepgym/envs/fizzbuzz/task.md +28 -0
  77. deepgym-0.1.0/src/deepgym/envs/fizzbuzz/verifier.py +168 -0
  78. deepgym-0.1.0/src/deepgym/envs/group_anagrams/metadata.json +7 -0
  79. deepgym-0.1.0/src/deepgym/envs/group_anagrams/reference_solution.py +10 -0
  80. deepgym-0.1.0/src/deepgym/envs/group_anagrams/task.md +22 -0
  81. deepgym-0.1.0/src/deepgym/envs/group_anagrams/verifier.py +161 -0
  82. deepgym-0.1.0/src/deepgym/envs/house_robber/metadata.json +7 -0
  83. deepgym-0.1.0/src/deepgym/envs/house_robber/reference_solution.py +10 -0
  84. deepgym-0.1.0/src/deepgym/envs/house_robber/task.md +25 -0
  85. deepgym-0.1.0/src/deepgym/envs/house_robber/verifier.py +145 -0
  86. deepgym-0.1.0/src/deepgym/envs/level_order_traversal/metadata.json +7 -0
  87. deepgym-0.1.0/src/deepgym/envs/level_order_traversal/reference_solution.py +17 -0
  88. deepgym-0.1.0/src/deepgym/envs/level_order_traversal/task.md +23 -0
  89. deepgym-0.1.0/src/deepgym/envs/level_order_traversal/verifier.py +160 -0
  90. deepgym-0.1.0/src/deepgym/envs/longest_common_subsequence/metadata.json +7 -0
  91. deepgym-0.1.0/src/deepgym/envs/longest_common_subsequence/reference_solution.py +10 -0
  92. deepgym-0.1.0/src/deepgym/envs/longest_common_subsequence/task.md +26 -0
  93. deepgym-0.1.0/src/deepgym/envs/longest_common_subsequence/verifier.py +147 -0
  94. deepgym-0.1.0/src/deepgym/envs/longest_consecutive/metadata.json +7 -0
  95. deepgym-0.1.0/src/deepgym/envs/longest_consecutive/reference_solution.py +12 -0
  96. deepgym-0.1.0/src/deepgym/envs/longest_consecutive/task.md +23 -0
  97. deepgym-0.1.0/src/deepgym/envs/longest_consecutive/verifier.py +147 -0
  98. deepgym-0.1.0/src/deepgym/envs/matrix_spiral/metadata.json +7 -0
  99. deepgym-0.1.0/src/deepgym/envs/matrix_spiral/reference_solution.py +21 -0
  100. deepgym-0.1.0/src/deepgym/envs/matrix_spiral/task.md +23 -0
  101. deepgym-0.1.0/src/deepgym/envs/matrix_spiral/verifier.py +165 -0
  102. deepgym-0.1.0/src/deepgym/envs/max_subarray/metadata.json +7 -0
  103. deepgym-0.1.0/src/deepgym/envs/max_subarray/reference_solution.py +7 -0
  104. deepgym-0.1.0/src/deepgym/envs/max_subarray/task.md +23 -0
  105. deepgym-0.1.0/src/deepgym/envs/max_subarray/verifier.py +142 -0
  106. deepgym-0.1.0/src/deepgym/envs/merge_intervals/metadata.json +7 -0
  107. deepgym-0.1.0/src/deepgym/envs/merge_intervals/reference_solution.py +11 -0
  108. deepgym-0.1.0/src/deepgym/envs/merge_intervals/task.md +23 -0
  109. deepgym-0.1.0/src/deepgym/envs/merge_intervals/verifier.py +157 -0
  110. deepgym-0.1.0/src/deepgym/envs/multi_turn/debug_fix/buggy.py +10 -0
  111. deepgym-0.1.0/src/deepgym/envs/multi_turn/debug_fix/reference_solution.py +48 -0
  112. deepgym-0.1.0/src/deepgym/envs/multi_turn/debug_fix/task.md +7 -0
  113. deepgym-0.1.0/src/deepgym/envs/multi_turn/debug_fix/test_buggy.py +41 -0
  114. deepgym-0.1.0/src/deepgym/envs/multi_turn/debug_fix/verifier.py +53 -0
  115. deepgym-0.1.0/src/deepgym/envs/palindrome_check/metadata.json +7 -0
  116. deepgym-0.1.0/src/deepgym/envs/palindrome_check/reference_solution.py +3 -0
  117. deepgym-0.1.0/src/deepgym/envs/palindrome_check/task.md +24 -0
  118. deepgym-0.1.0/src/deepgym/envs/palindrome_check/verifier.py +142 -0
  119. deepgym-0.1.0/src/deepgym/envs/python_sorting/reference_solution.py +35 -0
  120. deepgym-0.1.0/src/deepgym/envs/python_sorting/task.md +28 -0
  121. deepgym-0.1.0/src/deepgym/envs/python_sorting/verifier.py +197 -0
  122. deepgym-0.1.0/src/deepgym/envs/registry.json +225 -0
  123. deepgym-0.1.0/src/deepgym/envs/remove_duplicates/metadata.json +7 -0
  124. deepgym-0.1.0/src/deepgym/envs/remove_duplicates/reference_solution.py +8 -0
  125. deepgym-0.1.0/src/deepgym/envs/remove_duplicates/task.md +23 -0
  126. deepgym-0.1.0/src/deepgym/envs/remove_duplicates/verifier.py +137 -0
  127. deepgym-0.1.0/src/deepgym/envs/reverse_string/metadata.json +7 -0
  128. deepgym-0.1.0/src/deepgym/envs/reverse_string/reference_solution.py +2 -0
  129. deepgym-0.1.0/src/deepgym/envs/reverse_string/task.md +24 -0
  130. deepgym-0.1.0/src/deepgym/envs/reverse_string/verifier.py +134 -0
  131. deepgym-0.1.0/src/deepgym/envs/roman_to_integer/metadata.json +7 -0
  132. deepgym-0.1.0/src/deepgym/envs/roman_to_integer/reference_solution.py +9 -0
  133. deepgym-0.1.0/src/deepgym/envs/roman_to_integer/task.md +27 -0
  134. deepgym-0.1.0/src/deepgym/envs/roman_to_integer/verifier.py +170 -0
  135. deepgym-0.1.0/src/deepgym/envs/rotate_array/metadata.json +7 -0
  136. deepgym-0.1.0/src/deepgym/envs/rotate_array/reference_solution.py +5 -0
  137. deepgym-0.1.0/src/deepgym/envs/rotate_array/task.md +24 -0
  138. deepgym-0.1.0/src/deepgym/envs/rotate_array/verifier.py +139 -0
  139. deepgym-0.1.0/src/deepgym/envs/string_manipulation/reference_solution.py +25 -0
  140. deepgym-0.1.0/src/deepgym/envs/string_manipulation/task.md +27 -0
  141. deepgym-0.1.0/src/deepgym/envs/string_manipulation/verifier.py +244 -0
  142. deepgym-0.1.0/src/deepgym/envs/tool_use/api_request/metadata.json +8 -0
  143. deepgym-0.1.0/src/deepgym/envs/tool_use/api_request/reference_solution.py +16 -0
  144. deepgym-0.1.0/src/deepgym/envs/tool_use/api_request/task.md +12 -0
  145. deepgym-0.1.0/src/deepgym/envs/tool_use/api_request/verifier.py +175 -0
  146. deepgym-0.1.0/src/deepgym/envs/tool_use/data_pipeline/metadata.json +8 -0
  147. deepgym-0.1.0/src/deepgym/envs/tool_use/data_pipeline/reference_solution.py +22 -0
  148. deepgym-0.1.0/src/deepgym/envs/tool_use/data_pipeline/task.md +14 -0
  149. deepgym-0.1.0/src/deepgym/envs/tool_use/data_pipeline/verifier.py +152 -0
  150. deepgym-0.1.0/src/deepgym/envs/top_k_frequent/metadata.json +7 -0
  151. deepgym-0.1.0/src/deepgym/envs/top_k_frequent/reference_solution.py +7 -0
  152. deepgym-0.1.0/src/deepgym/envs/top_k_frequent/task.md +23 -0
  153. deepgym-0.1.0/src/deepgym/envs/top_k_frequent/verifier.py +144 -0
  154. deepgym-0.1.0/src/deepgym/envs/two_sum/reference_solution.py +18 -0
  155. deepgym-0.1.0/src/deepgym/envs/two_sum/task.md +27 -0
  156. deepgym-0.1.0/src/deepgym/envs/two_sum/verifier.py +275 -0
  157. deepgym-0.1.0/src/deepgym/envs/valid_parentheses/metadata.json +7 -0
  158. deepgym-0.1.0/src/deepgym/envs/valid_parentheses/reference_solution.py +11 -0
  159. deepgym-0.1.0/src/deepgym/envs/valid_parentheses/task.md +25 -0
  160. deepgym-0.1.0/src/deepgym/envs/valid_parentheses/verifier.py +163 -0
  161. deepgym-0.1.0/src/deepgym/exceptions.py +17 -0
  162. deepgym-0.1.0/src/deepgym/exploit_db.py +216 -0
  163. deepgym-0.1.0/src/deepgym/gym.py +336 -0
  164. deepgym-0.1.0/src/deepgym/integrations/__init__.py +10 -0
  165. deepgym-0.1.0/src/deepgym/integrations/hf.py +281 -0
  166. deepgym-0.1.0/src/deepgym/integrations/lm_eval.py +230 -0
  167. deepgym-0.1.0/src/deepgym/integrations/openrlhf.py +64 -0
  168. deepgym-0.1.0/src/deepgym/integrations/reward.py +150 -0
  169. deepgym-0.1.0/src/deepgym/integrations/trl.py +105 -0
  170. deepgym-0.1.0/src/deepgym/integrations/verl.py +116 -0
  171. deepgym-0.1.0/src/deepgym/models.py +312 -0
  172. deepgym-0.1.0/src/deepgym/multi_turn.py +381 -0
  173. deepgym-0.1.0/src/deepgym/registry.py +300 -0
  174. deepgym-0.1.0/src/deepgym/rl_exploit_discovery.py +519 -0
  175. deepgym-0.1.0/src/deepgym/sandbox.py +509 -0
  176. deepgym-0.1.0/src/deepgym/static/__init__.py +0 -0
  177. deepgym-0.1.0/src/deepgym/static/web_ui.html +1077 -0
  178. deepgym-0.1.0/src/deepgym/verifier.py +95 -0
  179. deepgym-0.1.0/src/deepgym/verifier_template.py +164 -0
  180. deepgym-0.1.0/src/deepgym/web.py +183 -0
  181. deepgym-0.1.0/tests/__init__.py +0 -0
  182. deepgym-0.1.0/tests/conftest.py +103 -0
  183. deepgym-0.1.0/tests/test_adversarial.py +195 -0
  184. deepgym-0.1.0/tests/test_api.py +224 -0
  185. deepgym-0.1.0/tests/test_cli.py +177 -0
  186. deepgym-0.1.0/tests/test_core.py +84 -0
  187. deepgym-0.1.0/tests/test_envs_smoke.py +74 -0
  188. deepgym-0.1.0/tests/test_exploit_db.py +235 -0
  189. deepgym-0.1.0/tests/test_gym.py +309 -0
  190. deepgym-0.1.0/tests/test_integrations.py +173 -0
  191. deepgym-0.1.0/tests/test_local_executor.py +166 -0
  192. deepgym-0.1.0/tests/test_models.py +181 -0
  193. deepgym-0.1.0/tests/test_multi_turn.py +208 -0
  194. deepgym-0.1.0/tests/test_per_test_traces.py +364 -0
  195. deepgym-0.1.0/tests/test_registry.py +109 -0
  196. deepgym-0.1.0/tests/test_verifier_template.py +163 -0
  197. deepgym-0.1.0/tests/test_web.py +120 -0
@@ -0,0 +1,12 @@
1
+ # DeepGym Configuration
2
+ # Copy to .env and fill in values
3
+
4
+ # Daytona (for sandbox mode)
5
+ # DAYTONA_API_KEY=your_key_here
6
+ # DAYTONA_API_URL=http://localhost:3000 # for self-hosted
7
+
8
+ # DeepGym API Server (optional)
9
+ # DEEPGYM_API_KEY=your_server_key_here
10
+
11
+ # Anthropic (for LLM adversarial testing)
12
+ # ANTHROPIC_API_KEY=your_key_here
@@ -0,0 +1,65 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main, master]
6
+ pull_request:
7
+ branches: [main, master]
8
+
9
+ jobs:
10
+ test:
11
+ name: Test (Python ${{ matrix.python-version }})
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ fail-fast: false
15
+ matrix:
16
+ python-version: ["3.10", "3.11", "3.12"]
17
+
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+
21
+ - name: Set up Python ${{ matrix.python-version }}
22
+ uses: actions/setup-python@v5
23
+ with:
24
+ python-version: ${{ matrix.python-version }}
25
+ cache: pip
26
+
27
+ - name: Install dependencies
28
+ run: pip install -e ".[dev]"
29
+
30
+ - name: Lint (ruff)
31
+ run: ruff check src/
32
+
33
+ - name: Format check (ruff)
34
+ run: ruff format --check src/
35
+
36
+ - name: Run tests
37
+ run: pytest --tb=short -q
38
+ env:
39
+ DEEPGYM_NO_AUTH: "true"
40
+
41
+ publish:
42
+ name: Publish to PyPI
43
+ runs-on: ubuntu-latest
44
+ needs: test
45
+ if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
46
+ environment: pypi
47
+ permissions:
48
+ id-token: write
49
+
50
+ steps:
51
+ - uses: actions/checkout@v4
52
+
53
+ - name: Set up Python
54
+ uses: actions/setup-python@v5
55
+ with:
56
+ python-version: "3.12"
57
+ cache: pip
58
+
59
+ - name: Build package
60
+ run: |
61
+ pip install hatch
62
+ hatch build
63
+
64
+ - name: Publish to PyPI
65
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,47 @@
1
+ # Environment and secrets
2
+ .env
3
+
4
+ # Python
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+ *.so
9
+
10
+ # Distribution / packaging
11
+ dist/
12
+ build/
13
+ *.egg-info/
14
+ *.egg
15
+
16
+ # Virtual environments
17
+ .venv/
18
+ venv/
19
+
20
+ # Testing
21
+ .pytest_cache/
22
+ test_output/
23
+ htmlcov/
24
+ .coverage
25
+ coverage.xml
26
+
27
+ # Linting / formatting
28
+ .ruff_cache/
29
+ .mypy_cache/
30
+
31
+ # IDE
32
+ .vscode/
33
+ .idea/
34
+ *.swp
35
+ *.swo
36
+ *~
37
+
38
+ # OS
39
+ .DS_Store
40
+ Thumbs.db
41
+
42
+ # Data (SQLite databases are local, not tracked)
43
+ data/*.db
44
+
45
+ # Daytona (submodule, not part of this repo)
46
+ daytona/
47
+ .DS_Store
@@ -0,0 +1,15 @@
1
+ # Changelog
2
+
3
+ ## 0.1.0 (2026-03-17)
4
+
5
+ Initial release.
6
+
7
+ - Core SDK: DeepGym client with run(), run_batch(), eval()
8
+ - 25 built-in environments
9
+ - 2,350+ importable benchmarks
10
+ - Gymnasium-style API
11
+ - Framework integrations: TRL, verl, OpenRLHF
12
+ - Multi-turn environment support
13
+ - Web debugging UI
14
+ - FastAPI server with async jobs
15
+ - CLI: run, eval, serve, web, create
@@ -0,0 +1,240 @@
1
+ # DeepGym
2
+
3
+ Managed sandboxed execution, scoring, and evaluation infrastructure for RL and agent training loops. Models take actions, we execute them in Daytona sandboxes, run verification, and return reward signals.
4
+
5
+ ## Project structure
6
+
7
+ ```
8
+ src/deepgym/
9
+ ├── models.py # All Pydantic models (Environment, RunResult, VerifierResult, Job, etc.)
10
+ ├── core.py # Sync DeepGym client (mode: auto/daytona/local)
11
+ ├── async_core.py # Async client with semaphore-based concurrency
12
+ ├── sandbox.py # Daytona sandbox lifecycle + LocalExecutor fallback
13
+ ├── verifier.py # Verifier model + protocol validation
14
+ ├── verifier_template.py # Wrapper normalizing any verifier to JSON protocol
15
+ ├── adversarial.py # Reward-hack detection (5 attack strategies)
16
+ ├── exceptions.py # DeepGymError hierarchy
17
+ ├── cli.py # CLI: run, eval, serve, web, create
18
+ └── api/
19
+ ├── app.py # FastAPI app + API key auth middleware
20
+ ├── routes.py # Sync + async job endpoints
21
+ ├── schemas.py # Request/response Pydantic models
22
+ └── deps.py # Dependency injection
23
+ examples/ # 3 example environments with verifiers + solutions
24
+ ```
25
+
26
+ ## Commands
27
+
28
+ ```bash
29
+ # Install (local mode, no Daytona needed)
30
+ pip install -e .
31
+
32
+ # Install with Daytona support
33
+ pip install -e ".[daytona]"
34
+
35
+ # Install everything (dev + daytona)
36
+ pip install -e ".[all]"
37
+
38
+ # Run the API server
39
+ deepgym serve --host 127.0.0.1 --port 8000
40
+
41
+ # Run a single environment
42
+ deepgym run --task task.md --verifier verifier.py --solution solution.py
43
+
44
+ # Run linter
45
+ ruff check src/
46
+
47
+ # Run tests
48
+ pytest
49
+ ```
50
+
51
+ ## Daytona setup
52
+
53
+ Self-hosted (local):
54
+ ```bash
55
+ git clone https://github.com/daytonaio/daytona
56
+ cd daytona && docker compose -f docker/docker-compose.yaml up -d
57
+ # Dashboard: http://localhost:3000 (dev@daytona.io / password)
58
+ # Set DAYTONA_API_URL and DAYTONA_API_KEY for the local instance
59
+ ```
60
+
61
+ Cloud: set `DAYTONA_API_KEY` from app.daytona.io.
62
+
63
+ ## Core principles
64
+
65
+ All code MUST be fully optimized:
66
+ - Maximize algorithmic big-O efficiency for memory and runtime.
67
+ - Use parallelization and vectorization where appropriate.
68
+ - Follow DRY — maximize code reuse, no duplicated logic.
69
+ - No extra code beyond what is necessary. Zero technical debt.
70
+ - If code is not fully optimized, do another pass before finishing.
71
+
72
+ ## Code standards
73
+
74
+ ### Python version and types
75
+
76
+ - Target Python 3.10+. Use `X | Y` union syntax, not `Union[X, Y]` or `Optional[X]`.
77
+ - Use `from __future__ import annotations` only when needed for forward refs.
78
+ - Every public function and method has type annotations on all parameters and return type.
79
+ - Never use `Any` type unless absolutely necessary — prefer specific types.
80
+ - Use `Literal` for constrained string values, not bare `str`.
81
+ - Prefer `Sequence` over `list` in function signatures when the function only reads from the collection.
82
+ - Use `is` for comparing with `None`, `True`, `False`.
83
+
84
+ ### Pydantic
85
+
86
+ - All data models use Pydantic `BaseModel`, not `dataclass`.
87
+ - Use `Field()` for validation constraints (`ge=`, `le=`, `min_length=`, etc.).
88
+ - Use `model_validator` for cross-field validation, not `__post_init__`.
89
+ - Immutable models: set `model_config = ConfigDict(frozen=True)` where the model should not be mutated after creation.
90
+ - Never use `dict()` on models — use `model_dump()`.
91
+
92
+ ### Error handling
93
+
94
+ - Use the exception hierarchy in `exceptions.py`: `DeepGymError` > `VerifierError`, `SandboxError`, `TimeoutError`.
95
+ - Never silently swallow errors. If verifier output fails to parse, raise `VerifierError` with context; don't return a zero score.
96
+ - Always include the original exception as `raise XError(...) from e`.
97
+ - Never use bare `except:` clauses. Catch specific exceptions.
98
+ - Sandbox cleanup goes in `finally` blocks. Cleanup errors are logged but not raised.
99
+ - Use context managers (`with` statements) for resource cleanup.
100
+ - Provide meaningful error messages with context.
101
+ - Use `logger.error()` not `print()` for error reporting.
102
+
103
+ ### Verifier protocol
104
+
105
+ Every verifier outputs JSON to stdout:
106
+ ```json
107
+ {
108
+ "schema_version": "1.0",
109
+ "score": 0.85,
110
+ "passed": true,
111
+ "details": "8/10 tests passed",
112
+ "reward_components": {"correctness": 0.8, "efficiency": 0.9},
113
+ "metrics": {"execution_time_ms": 142, "memory_mb": 24},
114
+ "seed": 42,
115
+ "truncated": false,
116
+ "error_type": null
117
+ }
118
+ ```
119
+ - `score` is always clamped to the range 0.0-1.0.
120
+ - User verifiers return float, bool, or dict — the wrapper template normalizes to this schema.
121
+ - Exit codes: 0 = passed, 1 = failed, 2 = verifier error.
122
+
123
+ ### Async patterns
124
+
125
+ - Use `AsyncDaytona` and `asyncio.Semaphore` for parallel execution, never raw thread spawning.
126
+ - The sync `DeepGym` client uses `ThreadPoolExecutor` for `run_batch` only.
127
+ - All async methods are declared with `async def`. Never wrap sync code in `asyncio.to_thread` unless interfacing with sync-only libraries.
128
+ - Use `asyncio.gather(*tasks, return_exceptions=True)` for batch operations — don't let one failure kill the batch.
129
+
130
+ ### Function and class design
131
+
132
+ - Keep functions focused on a single responsibility.
133
+ - Never use mutable objects (lists, dicts) as default argument values. Use `Field(default_factory=...)` or `None`.
134
+ - Limit function parameters to 5 or fewer. Use a config/params object for more.
135
+ - Return early to reduce nesting.
136
+ - Keep classes focused on a single responsibility.
137
+ - Keep `__init__` simple — avoid complex logic.
138
+ - Prefer composition over inheritance.
139
+ - Use `@property` for computed attributes.
140
+ - Use list comprehensions and generator expressions where clearer than loops.
141
+ - Use `enumerate()` instead of manual counter variables.
142
+ - Use f-strings for string formatting.
143
+
144
+ ### Documentation
145
+
146
+ - Docstrings on all public classes, functions, and methods.
147
+ - Use imperative mood: "Create a sandbox" not "Creates a sandbox".
148
+ - Document function parameters, return values, and exceptions raised (Args/Returns/Raises).
149
+ - Keep comments up-to-date with code changes.
150
+ - Include examples in docstrings for complex functions.
151
+
152
+ ```python
153
+ def run(self, env: Environment, model_output: str) -> RunResult:
154
+ """Run a model output against an environment verifier in a sandbox.
155
+
156
+ Args:
157
+ env: The environment specification.
158
+ model_output: Model-generated solution source code.
159
+
160
+ Returns:
161
+ RunResult with score, pass/fail, timing, and verifier details.
162
+
163
+ Raises:
164
+ VerifierError: If verifier output is not valid JSON.
165
+ SandboxError: If sandbox creation fails.
166
+ """
167
+ ```
168
+
169
+ ### Code style
170
+
171
+ - Max line length: 100 (configured in pyproject.toml ruff).
172
+ - Imports: stdlib, then third-party, then local. Enforced by ruff `I` rules.
173
+ - No wildcard imports. No `import *`.
174
+ - Prefer early returns over deep nesting.
175
+ - No dead code. No commented-out code. No TODO without a linked issue.
176
+ - Single quotes for strings unless the string contains a single quote.
177
+ - Use `pathlib.Path` for file operations, not `os.path`.
178
+ - Use `logging` module, not `print()`, for any operational output. `print()` is only for CLI user-facing output.
179
+ - Use snake_case for functions/variables, PascalCase for classes, UPPER_CASE for constants.
180
+ - Never use emoji or unicode that emulates emoji in code or output.
181
+
182
+ ### Testing
183
+
184
+ - Tests go in `tests/` mirroring `src/deepgym/` structure.
185
+ - Use `pytest` with `pytest-asyncio` for async tests.
186
+ - Write unit tests for all new functions and classes.
187
+ - Test the verifier protocol contract explicitly — verify JSON output shape.
188
+ - Use `LocalExecutor` for tests, never require Daytona in CI.
189
+ - No mocking Daytona in tests — use `LocalExecutor` or skip with `@pytest.mark.skipif`.
190
+ - Follow Arrange-Act-Assert pattern.
191
+ - Never commit commented-out tests.
192
+ - Save test files before running them.
193
+ - Ensure test output folders are in `.gitignore`.
194
+
195
+ ### Security
196
+
197
+ - Sandbox network isolation is ON by default.
198
+ - Never run user-provided code directly on the host. Always run it in a sandbox or in a subprocess with a timeout.
199
+ - API key auth is required in production (`DEEPGYM_API_KEY` env var). Dev mode (unset) skips auth.
200
+ - Never store secrets, API keys, or passwords in code. Use `.env` files (ensure `.env` is in `.gitignore`).
201
+ - Never log or print API keys, tokens, PII, sandbox contents, or user code at INFO level. DEBUG only.
202
+ - Never log URLs containing API keys.
203
+ - Use environment variables for all sensitive configuration.
204
+ - Verifier code is untrusted — always run with resource limits (timeout, memory).
205
+
206
+ ### Git
207
+
208
+ - Conventional commits: `feat:`, `fix:`, `refactor:`, `docs:`, `test:`, `chore:`.
209
+ - One logical change per commit.
210
+ - Branch naming: `feat/xxx`, `fix/xxx`, `refactor/xxx`.
211
+ - Never commit commented-out code — delete it.
212
+ - Never commit debug print statements or breakpoints.
213
+ - Never commit credentials or sensitive data.
214
+
215
+ ### Before committing checklist
216
+
217
+ - All tests pass (`pytest`).
218
+ - Linter and formatter pass (`ruff check src/ && ruff format src/`).
219
+ - All functions have docstrings and type hints.
220
+ - No commented-out code or debug statements.
221
+ - No hardcoded credentials.
222
+
223
+ ### Maintainability
224
+
225
+ Long-term maintainability is a core priority. If you add new functionality, first check if there is shared logic that can be extracted to a separate module. Duplicate logic across multiple files is a code smell and should be avoided. Don't be afraid to change existing code. Don't take shortcuts by just adding local logic to solve a problem.
226
+
227
+ - Before adding new code, search for existing utilities that do the same thing.
228
+ - If you find yourself writing the same pattern in 2+ places, extract it immediately.
229
+ - Prefer modifying existing modules over creating new ones when the functionality is related.
230
+ - Keep module responsibilities clear and documented in docstrings.
231
+ - When refactoring, update all callers — don't leave dead imports or compatibility shims.
232
+
233
+ ### What NOT to do
234
+
235
+ - Don't add abstractions until there are 3+ concrete uses. Three similar lines > premature abstraction.
236
+ - Don't add optional parameters "for future use." Add them when needed.
237
+ - Don't use `Any` in type annotations unless interfacing with untyped external code.
238
+ - Don't add logging, metrics, or config for things that aren't built yet.
239
+ - Don't write defensive code against impossible states. Trust the type system and Pydantic validation.
240
+ - Don't use global mutable state. Pass dependencies explicitly or use FastAPI's DI.
@@ -0,0 +1,33 @@
1
+ # Contributing to DeepGym
2
+
3
+ ## Development Setup
4
+
5
+ ```bash
6
+ git clone https://github.com/abhishekgahlot2/deepgym.git
7
+ cd deepgym
8
+ pip install -e ".[dev]"
9
+ pytest
10
+ ruff check src/
11
+ ```
12
+
13
+ ## Running Tests
14
+
15
+ ```bash
16
+ pytest tests/ -v
17
+ ```
18
+
19
+ ## Code Style
20
+
21
+ - Python 3.10+
22
+ - Ruff for linting and formatting
23
+ - Single quotes for strings
24
+ - Type annotations on all public functions
25
+ - Docstrings on all public classes and methods
26
+
27
+ ## Submitting Changes
28
+
29
+ 1. Fork the repo
30
+ 2. Create a feature branch
31
+ 3. Make your changes with tests
32
+ 4. Run pytest and ruff
33
+ 5. Submit a PR
deepgym-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Abhishek Gahlot
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.