deepgym 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepgym-0.1.0/.env.example +12 -0
- deepgym-0.1.0/.github/workflows/ci.yml +65 -0
- deepgym-0.1.0/.gitignore +47 -0
- deepgym-0.1.0/CHANGELOG.md +15 -0
- deepgym-0.1.0/CLAUDE.md +240 -0
- deepgym-0.1.0/CONTRIBUTING.md +33 -0
- deepgym-0.1.0/LICENSE +21 -0
- deepgym-0.1.0/PKG-INFO +326 -0
- deepgym-0.1.0/README.md +276 -0
- deepgym-0.1.0/data/.gitkeep +0 -0
- deepgym-0.1.0/docs/rl-primer.html +837 -0
- deepgym-0.1.0/examples/python_sorting/reference_solution.py +35 -0
- deepgym-0.1.0/examples/python_sorting/task.md +28 -0
- deepgym-0.1.0/examples/python_sorting/verifier.py +147 -0
- deepgym-0.1.0/examples/run_example.py +23 -0
- deepgym-0.1.0/examples/string_manipulation/reference_solution.py +25 -0
- deepgym-0.1.0/examples/string_manipulation/task.md +27 -0
- deepgym-0.1.0/examples/string_manipulation/verifier.py +186 -0
- deepgym-0.1.0/examples/two_sum/reference_solution.py +18 -0
- deepgym-0.1.0/examples/two_sum/task.md +27 -0
- deepgym-0.1.0/examples/two_sum/verifier.py +205 -0
- deepgym-0.1.0/huggingface/app.py +93 -0
- deepgym-0.1.0/huggingface/requirements.txt +2 -0
- deepgym-0.1.0/notebooks/quickstart.ipynb +427 -0
- deepgym-0.1.0/pyproject.toml +71 -0
- deepgym-0.1.0/scripts/demo_grpo_reward.py +164 -0
- deepgym-0.1.0/scripts/demo_trl_grpo.py +292 -0
- deepgym-0.1.0/scripts/import_bigcodebench.py +206 -0
- deepgym-0.1.0/scripts/import_evalplus.py +338 -0
- deepgym-0.1.0/scripts/import_humaneval.py +148 -0
- deepgym-0.1.0/scripts/import_mbpp.py +178 -0
- deepgym-0.1.0/scripts/publish.sh +11 -0
- deepgym-0.1.0/scripts/run_blackbox_scan.py +445 -0
- deepgym-0.1.0/scripts/run_exploit_scan.py +254 -0
- deepgym-0.1.0/scripts/run_rl_discovery.py +293 -0
- deepgym-0.1.0/scripts/upload_hf_datasets.py +107 -0
- deepgym-0.1.0/scripts/validate_bigcodebench.py +85 -0
- deepgym-0.1.0/src/deepgym/__init__.py +58 -0
- deepgym-0.1.0/src/deepgym/adversarial.py +1088 -0
- deepgym-0.1.0/src/deepgym/api/__init__.py +0 -0
- deepgym-0.1.0/src/deepgym/api/app.py +147 -0
- deepgym-0.1.0/src/deepgym/api/deps.py +25 -0
- deepgym-0.1.0/src/deepgym/api/routes.py +375 -0
- deepgym-0.1.0/src/deepgym/api/schemas.py +233 -0
- deepgym-0.1.0/src/deepgym/async_core.py +676 -0
- deepgym-0.1.0/src/deepgym/cli.py +359 -0
- deepgym-0.1.0/src/deepgym/computer_use.py +240 -0
- deepgym-0.1.0/src/deepgym/core.py +329 -0
- deepgym-0.1.0/src/deepgym/envs/__init__.py +1 -0
- deepgym-0.1.0/src/deepgym/envs/anagram_check/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/anagram_check/reference_solution.py +2 -0
- deepgym-0.1.0/src/deepgym/envs/anagram_check/task.md +24 -0
- deepgym-0.1.0/src/deepgym/envs/anagram_check/verifier.py +139 -0
- deepgym-0.1.0/src/deepgym/envs/binary_search/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/binary_search/reference_solution.py +11 -0
- deepgym-0.1.0/src/deepgym/envs/binary_search/task.md +24 -0
- deepgym-0.1.0/src/deepgym/envs/binary_search/verifier.py +138 -0
- deepgym-0.1.0/src/deepgym/envs/climbing_stairs/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/climbing_stairs/reference_solution.py +7 -0
- deepgym-0.1.0/src/deepgym/envs/climbing_stairs/task.md +23 -0
- deepgym-0.1.0/src/deepgym/envs/climbing_stairs/verifier.py +140 -0
- deepgym-0.1.0/src/deepgym/envs/coin_change/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/coin_change/reference_solution.py +8 -0
- deepgym-0.1.0/src/deepgym/envs/coin_change/task.md +24 -0
- deepgym-0.1.0/src/deepgym/envs/coin_change/verifier.py +150 -0
- deepgym-0.1.0/src/deepgym/envs/computer_use/cli_task/metadata.json +8 -0
- deepgym-0.1.0/src/deepgym/envs/computer_use/cli_task/reference_solution.py +49 -0
- deepgym-0.1.0/src/deepgym/envs/computer_use/cli_task/task.md +14 -0
- deepgym-0.1.0/src/deepgym/envs/computer_use/cli_task/verifier.py +140 -0
- deepgym-0.1.0/src/deepgym/envs/computer_use/file_organizer/metadata.json +8 -0
- deepgym-0.1.0/src/deepgym/envs/computer_use/file_organizer/reference_solution.py +26 -0
- deepgym-0.1.0/src/deepgym/envs/computer_use/file_organizer/task.md +19 -0
- deepgym-0.1.0/src/deepgym/envs/computer_use/file_organizer/verifier.py +122 -0
- deepgym-0.1.0/src/deepgym/envs/fizzbuzz/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/fizzbuzz/reference_solution.py +9 -0
- deepgym-0.1.0/src/deepgym/envs/fizzbuzz/task.md +28 -0
- deepgym-0.1.0/src/deepgym/envs/fizzbuzz/verifier.py +168 -0
- deepgym-0.1.0/src/deepgym/envs/group_anagrams/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/group_anagrams/reference_solution.py +10 -0
- deepgym-0.1.0/src/deepgym/envs/group_anagrams/task.md +22 -0
- deepgym-0.1.0/src/deepgym/envs/group_anagrams/verifier.py +161 -0
- deepgym-0.1.0/src/deepgym/envs/house_robber/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/house_robber/reference_solution.py +10 -0
- deepgym-0.1.0/src/deepgym/envs/house_robber/task.md +25 -0
- deepgym-0.1.0/src/deepgym/envs/house_robber/verifier.py +145 -0
- deepgym-0.1.0/src/deepgym/envs/level_order_traversal/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/level_order_traversal/reference_solution.py +17 -0
- deepgym-0.1.0/src/deepgym/envs/level_order_traversal/task.md +23 -0
- deepgym-0.1.0/src/deepgym/envs/level_order_traversal/verifier.py +160 -0
- deepgym-0.1.0/src/deepgym/envs/longest_common_subsequence/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/longest_common_subsequence/reference_solution.py +10 -0
- deepgym-0.1.0/src/deepgym/envs/longest_common_subsequence/task.md +26 -0
- deepgym-0.1.0/src/deepgym/envs/longest_common_subsequence/verifier.py +147 -0
- deepgym-0.1.0/src/deepgym/envs/longest_consecutive/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/longest_consecutive/reference_solution.py +12 -0
- deepgym-0.1.0/src/deepgym/envs/longest_consecutive/task.md +23 -0
- deepgym-0.1.0/src/deepgym/envs/longest_consecutive/verifier.py +147 -0
- deepgym-0.1.0/src/deepgym/envs/matrix_spiral/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/matrix_spiral/reference_solution.py +21 -0
- deepgym-0.1.0/src/deepgym/envs/matrix_spiral/task.md +23 -0
- deepgym-0.1.0/src/deepgym/envs/matrix_spiral/verifier.py +165 -0
- deepgym-0.1.0/src/deepgym/envs/max_subarray/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/max_subarray/reference_solution.py +7 -0
- deepgym-0.1.0/src/deepgym/envs/max_subarray/task.md +23 -0
- deepgym-0.1.0/src/deepgym/envs/max_subarray/verifier.py +142 -0
- deepgym-0.1.0/src/deepgym/envs/merge_intervals/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/merge_intervals/reference_solution.py +11 -0
- deepgym-0.1.0/src/deepgym/envs/merge_intervals/task.md +23 -0
- deepgym-0.1.0/src/deepgym/envs/merge_intervals/verifier.py +157 -0
- deepgym-0.1.0/src/deepgym/envs/multi_turn/debug_fix/buggy.py +10 -0
- deepgym-0.1.0/src/deepgym/envs/multi_turn/debug_fix/reference_solution.py +48 -0
- deepgym-0.1.0/src/deepgym/envs/multi_turn/debug_fix/task.md +7 -0
- deepgym-0.1.0/src/deepgym/envs/multi_turn/debug_fix/test_buggy.py +41 -0
- deepgym-0.1.0/src/deepgym/envs/multi_turn/debug_fix/verifier.py +53 -0
- deepgym-0.1.0/src/deepgym/envs/palindrome_check/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/palindrome_check/reference_solution.py +3 -0
- deepgym-0.1.0/src/deepgym/envs/palindrome_check/task.md +24 -0
- deepgym-0.1.0/src/deepgym/envs/palindrome_check/verifier.py +142 -0
- deepgym-0.1.0/src/deepgym/envs/python_sorting/reference_solution.py +35 -0
- deepgym-0.1.0/src/deepgym/envs/python_sorting/task.md +28 -0
- deepgym-0.1.0/src/deepgym/envs/python_sorting/verifier.py +197 -0
- deepgym-0.1.0/src/deepgym/envs/registry.json +225 -0
- deepgym-0.1.0/src/deepgym/envs/remove_duplicates/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/remove_duplicates/reference_solution.py +8 -0
- deepgym-0.1.0/src/deepgym/envs/remove_duplicates/task.md +23 -0
- deepgym-0.1.0/src/deepgym/envs/remove_duplicates/verifier.py +137 -0
- deepgym-0.1.0/src/deepgym/envs/reverse_string/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/reverse_string/reference_solution.py +2 -0
- deepgym-0.1.0/src/deepgym/envs/reverse_string/task.md +24 -0
- deepgym-0.1.0/src/deepgym/envs/reverse_string/verifier.py +134 -0
- deepgym-0.1.0/src/deepgym/envs/roman_to_integer/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/roman_to_integer/reference_solution.py +9 -0
- deepgym-0.1.0/src/deepgym/envs/roman_to_integer/task.md +27 -0
- deepgym-0.1.0/src/deepgym/envs/roman_to_integer/verifier.py +170 -0
- deepgym-0.1.0/src/deepgym/envs/rotate_array/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/rotate_array/reference_solution.py +5 -0
- deepgym-0.1.0/src/deepgym/envs/rotate_array/task.md +24 -0
- deepgym-0.1.0/src/deepgym/envs/rotate_array/verifier.py +139 -0
- deepgym-0.1.0/src/deepgym/envs/string_manipulation/reference_solution.py +25 -0
- deepgym-0.1.0/src/deepgym/envs/string_manipulation/task.md +27 -0
- deepgym-0.1.0/src/deepgym/envs/string_manipulation/verifier.py +244 -0
- deepgym-0.1.0/src/deepgym/envs/tool_use/api_request/metadata.json +8 -0
- deepgym-0.1.0/src/deepgym/envs/tool_use/api_request/reference_solution.py +16 -0
- deepgym-0.1.0/src/deepgym/envs/tool_use/api_request/task.md +12 -0
- deepgym-0.1.0/src/deepgym/envs/tool_use/api_request/verifier.py +175 -0
- deepgym-0.1.0/src/deepgym/envs/tool_use/data_pipeline/metadata.json +8 -0
- deepgym-0.1.0/src/deepgym/envs/tool_use/data_pipeline/reference_solution.py +22 -0
- deepgym-0.1.0/src/deepgym/envs/tool_use/data_pipeline/task.md +14 -0
- deepgym-0.1.0/src/deepgym/envs/tool_use/data_pipeline/verifier.py +152 -0
- deepgym-0.1.0/src/deepgym/envs/top_k_frequent/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/top_k_frequent/reference_solution.py +7 -0
- deepgym-0.1.0/src/deepgym/envs/top_k_frequent/task.md +23 -0
- deepgym-0.1.0/src/deepgym/envs/top_k_frequent/verifier.py +144 -0
- deepgym-0.1.0/src/deepgym/envs/two_sum/reference_solution.py +18 -0
- deepgym-0.1.0/src/deepgym/envs/two_sum/task.md +27 -0
- deepgym-0.1.0/src/deepgym/envs/two_sum/verifier.py +275 -0
- deepgym-0.1.0/src/deepgym/envs/valid_parentheses/metadata.json +7 -0
- deepgym-0.1.0/src/deepgym/envs/valid_parentheses/reference_solution.py +11 -0
- deepgym-0.1.0/src/deepgym/envs/valid_parentheses/task.md +25 -0
- deepgym-0.1.0/src/deepgym/envs/valid_parentheses/verifier.py +163 -0
- deepgym-0.1.0/src/deepgym/exceptions.py +17 -0
- deepgym-0.1.0/src/deepgym/exploit_db.py +216 -0
- deepgym-0.1.0/src/deepgym/gym.py +336 -0
- deepgym-0.1.0/src/deepgym/integrations/__init__.py +10 -0
- deepgym-0.1.0/src/deepgym/integrations/hf.py +281 -0
- deepgym-0.1.0/src/deepgym/integrations/lm_eval.py +230 -0
- deepgym-0.1.0/src/deepgym/integrations/openrlhf.py +64 -0
- deepgym-0.1.0/src/deepgym/integrations/reward.py +150 -0
- deepgym-0.1.0/src/deepgym/integrations/trl.py +105 -0
- deepgym-0.1.0/src/deepgym/integrations/verl.py +116 -0
- deepgym-0.1.0/src/deepgym/models.py +312 -0
- deepgym-0.1.0/src/deepgym/multi_turn.py +381 -0
- deepgym-0.1.0/src/deepgym/registry.py +300 -0
- deepgym-0.1.0/src/deepgym/rl_exploit_discovery.py +519 -0
- deepgym-0.1.0/src/deepgym/sandbox.py +509 -0
- deepgym-0.1.0/src/deepgym/static/__init__.py +0 -0
- deepgym-0.1.0/src/deepgym/static/web_ui.html +1077 -0
- deepgym-0.1.0/src/deepgym/verifier.py +95 -0
- deepgym-0.1.0/src/deepgym/verifier_template.py +164 -0
- deepgym-0.1.0/src/deepgym/web.py +183 -0
- deepgym-0.1.0/tests/__init__.py +0 -0
- deepgym-0.1.0/tests/conftest.py +103 -0
- deepgym-0.1.0/tests/test_adversarial.py +195 -0
- deepgym-0.1.0/tests/test_api.py +224 -0
- deepgym-0.1.0/tests/test_cli.py +177 -0
- deepgym-0.1.0/tests/test_core.py +84 -0
- deepgym-0.1.0/tests/test_envs_smoke.py +74 -0
- deepgym-0.1.0/tests/test_exploit_db.py +235 -0
- deepgym-0.1.0/tests/test_gym.py +309 -0
- deepgym-0.1.0/tests/test_integrations.py +173 -0
- deepgym-0.1.0/tests/test_local_executor.py +166 -0
- deepgym-0.1.0/tests/test_models.py +181 -0
- deepgym-0.1.0/tests/test_multi_turn.py +208 -0
- deepgym-0.1.0/tests/test_per_test_traces.py +364 -0
- deepgym-0.1.0/tests/test_registry.py +109 -0
- deepgym-0.1.0/tests/test_verifier_template.py +163 -0
- deepgym-0.1.0/tests/test_web.py +120 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# DeepGym Configuration
|
|
2
|
+
# Copy to .env and fill in values
|
|
3
|
+
|
|
4
|
+
# Daytona (for sandbox mode)
|
|
5
|
+
# DAYTONA_API_KEY=your_key_here
|
|
6
|
+
# DAYTONA_API_URL=http://localhost:3000 # for self-hosted
|
|
7
|
+
|
|
8
|
+
# DeepGym API Server (optional)
|
|
9
|
+
# DEEPGYM_API_KEY=your_server_key_here
|
|
10
|
+
|
|
11
|
+
# Anthropic (for LLM adversarial testing)
|
|
12
|
+
# ANTHROPIC_API_KEY=your_key_here
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main, master]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main, master]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
name: Test (Python ${{ matrix.python-version }})
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
fail-fast: false
|
|
15
|
+
matrix:
|
|
16
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
|
|
21
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
22
|
+
uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version: ${{ matrix.python-version }}
|
|
25
|
+
cache: pip
|
|
26
|
+
|
|
27
|
+
- name: Install dependencies
|
|
28
|
+
run: pip install -e ".[dev]"
|
|
29
|
+
|
|
30
|
+
- name: Lint (ruff)
|
|
31
|
+
run: ruff check src/
|
|
32
|
+
|
|
33
|
+
- name: Format check (ruff)
|
|
34
|
+
run: ruff format --check src/
|
|
35
|
+
|
|
36
|
+
- name: Run tests
|
|
37
|
+
run: pytest --tb=short -q
|
|
38
|
+
env:
|
|
39
|
+
DEEPGYM_NO_AUTH: "true"
|
|
40
|
+
|
|
41
|
+
publish:
|
|
42
|
+
name: Publish to PyPI
|
|
43
|
+
runs-on: ubuntu-latest
|
|
44
|
+
needs: test
|
|
45
|
+
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
|
|
46
|
+
environment: pypi
|
|
47
|
+
permissions:
|
|
48
|
+
id-token: write
|
|
49
|
+
|
|
50
|
+
steps:
|
|
51
|
+
- uses: actions/checkout@v4
|
|
52
|
+
|
|
53
|
+
- name: Set up Python
|
|
54
|
+
uses: actions/setup-python@v5
|
|
55
|
+
with:
|
|
56
|
+
python-version: "3.12"
|
|
57
|
+
cache: pip
|
|
58
|
+
|
|
59
|
+
- name: Build package
|
|
60
|
+
run: |
|
|
61
|
+
pip install hatch
|
|
62
|
+
hatch build
|
|
63
|
+
|
|
64
|
+
- name: Publish to PyPI
|
|
65
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
deepgym-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Environment and secrets
|
|
2
|
+
.env
|
|
3
|
+
|
|
4
|
+
# Python
|
|
5
|
+
__pycache__/
|
|
6
|
+
*.py[cod]
|
|
7
|
+
*$py.class
|
|
8
|
+
*.so
|
|
9
|
+
|
|
10
|
+
# Distribution / packaging
|
|
11
|
+
dist/
|
|
12
|
+
build/
|
|
13
|
+
*.egg-info/
|
|
14
|
+
*.egg
|
|
15
|
+
|
|
16
|
+
# Virtual environments
|
|
17
|
+
.venv/
|
|
18
|
+
venv/
|
|
19
|
+
|
|
20
|
+
# Testing
|
|
21
|
+
.pytest_cache/
|
|
22
|
+
test_output/
|
|
23
|
+
htmlcov/
|
|
24
|
+
.coverage
|
|
25
|
+
coverage.xml
|
|
26
|
+
|
|
27
|
+
# Linting / formatting
|
|
28
|
+
.ruff_cache/
|
|
29
|
+
.mypy_cache/
|
|
30
|
+
|
|
31
|
+
# IDE
|
|
32
|
+
.vscode/
|
|
33
|
+
.idea/
|
|
34
|
+
*.swp
|
|
35
|
+
*.swo
|
|
36
|
+
*~
|
|
37
|
+
|
|
38
|
+
# OS
|
|
39
|
+
.DS_Store
|
|
40
|
+
Thumbs.db
|
|
41
|
+
|
|
42
|
+
# Data (SQLite databases are local, not tracked)
|
|
43
|
+
data/*.db
|
|
44
|
+
|
|
45
|
+
# Daytona (submodule, not part of this repo)
|
|
46
|
+
daytona/
|
|
47
|
+
.DS_Store
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.1.0 (2026-03-17)
|
|
4
|
+
|
|
5
|
+
Initial release.
|
|
6
|
+
|
|
7
|
+
- Core SDK: DeepGym client with run(), run_batch(), eval()
|
|
8
|
+
- 25 built-in environments
|
|
9
|
+
- 2,350+ importable benchmarks
|
|
10
|
+
- Gymnasium-style API
|
|
11
|
+
- Framework integrations: TRL, verl, OpenRLHF
|
|
12
|
+
- Multi-turn environment support
|
|
13
|
+
- Web debugging UI
|
|
14
|
+
- FastAPI server with async jobs
|
|
15
|
+
- CLI: run, eval, serve, web, create
|
deepgym-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
# DeepGym
|
|
2
|
+
|
|
3
|
+
Managed sandboxed execution, scoring, and evaluation infrastructure for RL and agent training loops. Models take actions, we execute them in Daytona sandboxes, run verification, and return reward signals.
|
|
4
|
+
|
|
5
|
+
## Project structure
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
src/deepgym/
|
|
9
|
+
├── models.py # All Pydantic models (Environment, RunResult, VerifierResult, Job, etc.)
|
|
10
|
+
├── core.py # Sync DeepGym client (mode: auto/daytona/local)
|
|
11
|
+
├── async_core.py # Async client with semaphore-based concurrency
|
|
12
|
+
├── sandbox.py # Daytona sandbox lifecycle + LocalExecutor fallback
|
|
13
|
+
├── verifier.py # Verifier model + protocol validation
|
|
14
|
+
├── verifier_template.py # Wrapper normalizing any verifier to JSON protocol
|
|
15
|
+
├── adversarial.py # Reward-hack detection (5 attack strategies)
|
|
16
|
+
├── exceptions.py # DeepGymError hierarchy
|
|
17
|
+
├── cli.py # CLI: run, eval, serve, web, create
|
|
18
|
+
└── api/
|
|
19
|
+
├── app.py # FastAPI app + API key auth middleware
|
|
20
|
+
├── routes.py # Sync + async job endpoints
|
|
21
|
+
├── schemas.py # Request/response Pydantic models
|
|
22
|
+
└── deps.py # Dependency injection
|
|
23
|
+
examples/ # 3 example environments with verifiers + solutions
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Commands
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# Install (local mode, no Daytona needed)
|
|
30
|
+
pip install -e .
|
|
31
|
+
|
|
32
|
+
# Install with Daytona support
|
|
33
|
+
pip install -e ".[daytona]"
|
|
34
|
+
|
|
35
|
+
# Install everything (dev + daytona)
|
|
36
|
+
pip install -e ".[all]"
|
|
37
|
+
|
|
38
|
+
# Run the API server
|
|
39
|
+
deepgym serve --host 127.0.0.1 --port 8000
|
|
40
|
+
|
|
41
|
+
# Run a single environment
|
|
42
|
+
deepgym run --task task.md --verifier verifier.py --solution solution.py
|
|
43
|
+
|
|
44
|
+
# Run linter
|
|
45
|
+
ruff check src/
|
|
46
|
+
|
|
47
|
+
# Run tests
|
|
48
|
+
pytest
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Daytona setup
|
|
52
|
+
|
|
53
|
+
Self-hosted (local):
|
|
54
|
+
```bash
|
|
55
|
+
git clone https://github.com/daytonaio/daytona
|
|
56
|
+
cd daytona && docker compose -f docker/docker-compose.yaml up -d
|
|
57
|
+
# Dashboard: http://localhost:3000 (dev@daytona.io / password)
|
|
58
|
+
# Set DAYTONA_API_URL and DAYTONA_API_KEY for the local instance
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Cloud: set `DAYTONA_API_KEY` from app.daytona.io.
|
|
62
|
+
|
|
63
|
+
## Core principles
|
|
64
|
+
|
|
65
|
+
All code MUST be fully optimized:
|
|
66
|
+
- Maximize algorithmic big-O efficiency for memory and runtime.
|
|
67
|
+
- Use parallelization and vectorization where appropriate.
|
|
68
|
+
- Follow DRY — maximize code reuse, no duplicated logic.
|
|
69
|
+
- No extra code beyond what is necessary. Zero technical debt.
|
|
70
|
+
- If code is not fully optimized, do another pass before finishing.
|
|
71
|
+
|
|
72
|
+
## Code standards
|
|
73
|
+
|
|
74
|
+
### Python version and types
|
|
75
|
+
|
|
76
|
+
- Target Python 3.10+. Use `X | Y` union syntax, not `Union[X, Y]` or `Optional[X]`.
|
|
77
|
+
- Use `from __future__ import annotations` only when needed for forward refs.
|
|
78
|
+
- Every public function and method has type annotations on all parameters and return type.
|
|
79
|
+
- Never use `Any` type unless absolutely necessary — prefer specific types.
|
|
80
|
+
- Use `Literal` for constrained string values, not bare `str`.
|
|
81
|
+
- Prefer `Sequence` over `list` in function signatures when the function only reads from the collection.
|
|
82
|
+
- Use `is` for comparing with `None`, `True`, `False`.
|
|
83
|
+
|
|
84
|
+
### Pydantic
|
|
85
|
+
|
|
86
|
+
- All data models use Pydantic `BaseModel`, not `dataclass`.
|
|
87
|
+
- Use `Field()` for validation constraints (`ge=`, `le=`, `min_length=`, etc.).
|
|
88
|
+
- Use `model_validator` for cross-field validation, not `__post_init__`.
|
|
89
|
+
- Immutable models: set `model_config = ConfigDict(frozen=True)` where the model should not be mutated after creation.
|
|
90
|
+
- Never use `dict()` on models — use `model_dump()`.
|
|
91
|
+
|
|
92
|
+
### Error handling
|
|
93
|
+
|
|
94
|
+
- Use the exception hierarchy in `exceptions.py`: `DeepGymError` > `VerifierError`, `SandboxError`, `TimeoutError`.
|
|
95
|
+
- Never silently swallow errors. If verifier output fails to parse, raise `VerifierError` with context instead of returning a zero score.
|
|
96
|
+
- Always include the original exception as `raise XError(...) from e`.
|
|
97
|
+
- Never use bare `except:` clauses. Catch specific exceptions.
|
|
98
|
+
- Sandbox cleanup goes in `finally` blocks. Cleanup errors are logged but not raised.
|
|
99
|
+
- Use context managers (`with` statements) for resource cleanup.
|
|
100
|
+
- Provide meaningful error messages with context.
|
|
101
|
+
- Use `logger.error()` not `print()` for error reporting.
|
|
102
|
+
|
|
103
|
+
### Verifier protocol
|
|
104
|
+
|
|
105
|
+
Every verifier outputs JSON to stdout:
|
|
106
|
+
```json
|
|
107
|
+
{
|
|
108
|
+
"schema_version": "1.0",
|
|
109
|
+
"score": 0.85,
|
|
110
|
+
"passed": true,
|
|
111
|
+
"details": "8/10 tests passed",
|
|
112
|
+
"reward_components": {"correctness": 0.8, "efficiency": 0.9},
|
|
113
|
+
"metrics": {"execution_time_ms": 142, "memory_mb": 24},
|
|
114
|
+
"seed": 42,
|
|
115
|
+
"truncated": false,
|
|
116
|
+
"error_type": null
|
|
117
|
+
}
|
|
118
|
+
```
|
|
119
|
+
- `score` is always 0.0-1.0, clamped.
|
|
120
|
+
- User verifiers return float, bool, or dict — the wrapper template normalizes to this schema.
|
|
121
|
+
- Exit codes: 0 = passed, 1 = failed, 2 = verifier error.
|
|
122
|
+
|
|
123
|
+
### Async patterns
|
|
124
|
+
|
|
125
|
+
- Use `AsyncDaytona` and `asyncio.Semaphore` for parallel execution, never raw thread spawning.
|
|
126
|
+
- The sync `DeepGym` client uses `ThreadPoolExecutor` for `run_batch` only.
|
|
127
|
+
- All async methods are declared with `async def`. Never wrap sync code in `asyncio.to_thread` unless interfacing with sync-only libraries.
|
|
128
|
+
- Use `asyncio.gather(*tasks, return_exceptions=True)` for batch operations — don't let one failure kill the batch.
|
|
129
|
+
|
|
130
|
+
### Function and class design
|
|
131
|
+
|
|
132
|
+
- Keep functions focused on a single responsibility.
|
|
133
|
+
- Never use mutable objects (lists, dicts) as default argument values. Use `Field(default_factory=...)` or `None`.
|
|
134
|
+
- Limit function parameters to 5 or fewer. Use a config/params object for more.
|
|
135
|
+
- Return early to reduce nesting.
|
|
136
|
+
- Keep classes focused on a single responsibility.
|
|
137
|
+
- Keep `__init__` simple — avoid complex logic.
|
|
138
|
+
- Prefer composition over inheritance.
|
|
139
|
+
- Use `@property` for computed attributes.
|
|
140
|
+
- Use list comprehensions and generator expressions where clearer than loops.
|
|
141
|
+
- Use `enumerate()` instead of manual counter variables.
|
|
142
|
+
- Use f-strings for string formatting.
|
|
143
|
+
|
|
144
|
+
### Documentation
|
|
145
|
+
|
|
146
|
+
- Docstrings on all public classes, functions, and methods.
|
|
147
|
+
- Use imperative mood: "Create a sandbox" not "Creates a sandbox".
|
|
148
|
+
- Document function parameters, return values, and exceptions raised (Args/Returns/Raises).
|
|
149
|
+
- Keep comments up-to-date with code changes.
|
|
150
|
+
- Include examples in docstrings for complex functions.
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
def run(self, env: Environment, model_output: str) -> RunResult:
|
|
154
|
+
"""Run a model output against an environment verifier in a sandbox.
|
|
155
|
+
|
|
156
|
+
Args:
|
|
157
|
+
env: The environment specification.
|
|
158
|
+
model_output: Model-generated solution source code.
|
|
159
|
+
|
|
160
|
+
Returns:
|
|
161
|
+
RunResult with score, pass/fail, timing, and verifier details.
|
|
162
|
+
|
|
163
|
+
Raises:
|
|
164
|
+
VerifierError: If verifier output is not valid JSON.
|
|
165
|
+
SandboxError: If sandbox creation fails.
|
|
166
|
+
"""
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Code style
|
|
170
|
+
|
|
171
|
+
- Max line length: 100 (configured in pyproject.toml ruff).
|
|
172
|
+
- Imports: stdlib, then third-party, then local. Enforced by ruff `I` rules.
|
|
173
|
+
- No wildcard imports. No `import *`.
|
|
174
|
+
- Prefer early returns over deep nesting.
|
|
175
|
+
- No dead code. No commented-out code. No TODO without a linked issue.
|
|
176
|
+
- Single quotes for strings unless the string contains a single quote.
|
|
177
|
+
- Use `pathlib.Path` for file operations, not `os.path`.
|
|
178
|
+
- Use `logging` module, not `print()`, for any operational output. `print()` is only for CLI user-facing output.
|
|
179
|
+
- Use snake_case for functions/variables, PascalCase for classes, UPPER_CASE for constants.
|
|
180
|
+
- Never use emoji or unicode that emulates emoji in code or output.
|
|
181
|
+
|
|
182
|
+
### Testing
|
|
183
|
+
|
|
184
|
+
- Tests go in `tests/` mirroring `src/deepgym/` structure.
|
|
185
|
+
- Use `pytest` with `pytest-asyncio` for async tests.
|
|
186
|
+
- Write unit tests for all new functions and classes.
|
|
187
|
+
- Test the verifier protocol contract explicitly — verify JSON output shape.
|
|
188
|
+
- Use `LocalExecutor` for tests, never require Daytona in CI.
|
|
189
|
+
- No mocking Daytona in tests — use `LocalExecutor` or skip with `@pytest.mark.skipif`.
|
|
190
|
+
- Follow Arrange-Act-Assert pattern.
|
|
191
|
+
- Never commit commented-out tests.
|
|
192
|
+
- Save test files before running them.
|
|
193
|
+
- Ensure test output folders are in `.gitignore`.
|
|
194
|
+
|
|
195
|
+
### Security
|
|
196
|
+
|
|
197
|
+
- Sandbox network isolation is ON by default.
|
|
198
|
+
- Never run user-provided code on the host. Always in sandbox or subprocess with timeout.
|
|
199
|
+
- API key auth is required in production (`DEEPGYM_API_KEY` env var). Dev mode (unset) skips auth.
|
|
200
|
+
- Never store secrets, API keys, or passwords in code. Use `.env` files (ensure `.env` is in `.gitignore`).
|
|
201
|
+
- Never log or print API keys, tokens, PII, sandbox contents, or user code at INFO level. DEBUG only.
|
|
202
|
+
- Never log URLs containing API keys.
|
|
203
|
+
- Use environment variables for all sensitive configuration.
|
|
204
|
+
- Verifier code is untrusted — always run with resource limits (timeout, memory).
|
|
205
|
+
|
|
206
|
+
### Git
|
|
207
|
+
|
|
208
|
+
- Conventional commits: `feat:`, `fix:`, `refactor:`, `docs:`, `test:`, `chore:`.
|
|
209
|
+
- One logical change per commit.
|
|
210
|
+
- Branch naming: `feat/xxx`, `fix/xxx`, `refactor/xxx`.
|
|
211
|
+
- Never commit commented-out code — delete it.
|
|
212
|
+
- Never commit debug print statements or breakpoints.
|
|
213
|
+
- Never commit credentials or sensitive data.
|
|
214
|
+
|
|
215
|
+
### Before committing checklist
|
|
216
|
+
|
|
217
|
+
- All tests pass (`pytest`).
|
|
218
|
+
- Linter and formatter pass (`ruff check src/ && ruff format --check src/`).
|
|
219
|
+
- All functions have docstrings and type hints.
|
|
220
|
+
- No commented-out code or debug statements.
|
|
221
|
+
- No hardcoded credentials.
|
|
222
|
+
|
|
223
|
+
### Maintainability
|
|
224
|
+
|
|
225
|
+
Long term maintainability is a core priority. If you add new functionality, first check if there is shared logic that can be extracted to a separate module. Duplicate logic across multiple files is a code smell and should be avoided. Don't be afraid to change existing code. Don't take shortcuts by just adding local logic to solve a problem.
|
|
226
|
+
|
|
227
|
+
- Before adding new code, search for existing utilities that do the same thing.
|
|
228
|
+
- If you find yourself writing the same pattern in 2+ places, extract it immediately.
|
|
229
|
+
- Prefer modifying existing modules over creating new ones when the functionality is related.
|
|
230
|
+
- Keep module responsibilities clear and documented in docstrings.
|
|
231
|
+
- When refactoring, update all callers — don't leave dead imports or compatibility shims.
|
|
232
|
+
|
|
233
|
+
### What NOT to do
|
|
234
|
+
|
|
235
|
+
- Don't add abstractions until there are 3+ concrete uses. Three similar lines > premature abstraction.
|
|
236
|
+
- Don't add optional parameters "for future use." Add them when needed.
|
|
237
|
+
- Don't use `Any` in type annotations unless interfacing with untyped external code.
|
|
238
|
+
- Don't add logging, metrics, or config for things that aren't built yet.
|
|
239
|
+
- Don't write defensive code against impossible states. Trust the type system and Pydantic validation.
|
|
240
|
+
- Don't use global mutable state. Pass dependencies explicitly or use FastAPI's DI.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Contributing to DeepGym
|
|
2
|
+
|
|
3
|
+
## Development Setup
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
git clone https://github.com/abhishekgahlot2/deepgym.git
|
|
7
|
+
cd deepgym
|
|
8
|
+
pip install -e ".[dev]"
|
|
9
|
+
pytest
|
|
10
|
+
ruff check src/
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Running Tests
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pytest tests/ -v
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Code Style
|
|
20
|
+
|
|
21
|
+
- Python 3.10+
|
|
22
|
+
- Ruff for linting and formatting
|
|
23
|
+
- Single quotes for strings
|
|
24
|
+
- Type annotations on all public functions
|
|
25
|
+
- Docstrings on all public classes and methods
|
|
26
|
+
|
|
27
|
+
## Submitting Changes
|
|
28
|
+
|
|
29
|
+
1. Fork the repo
|
|
30
|
+
2. Create a feature branch
|
|
31
|
+
3. Make your changes with tests
|
|
32
|
+
4. Run pytest and ruff
|
|
33
|
+
5. Submit a PR
|
deepgym-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Abhishek Gahlot
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|