nl-code 0.4.1__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. nl_code-0.5.0/.env.example +27 -0
  2. {nl_code-0.4.1 → nl_code-0.5.0}/.gitignore +1 -0
  3. {nl_code-0.4.1 → nl_code-0.5.0}/AGENTS.md +4 -0
  4. {nl_code-0.4.1 → nl_code-0.5.0}/CLAUDE.md +4 -0
  5. nl_code-0.5.0/PKG-INFO +132 -0
  6. nl_code-0.5.0/README.md +96 -0
  7. nl_code-0.5.0/docker/scientific.Dockerfile +67 -0
  8. {nl_code-0.4.1 → nl_code-0.5.0}/pyproject.toml +23 -1
  9. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/__init__.py +4 -0
  10. nl_code-0.5.0/src/nl_code/code_execution/__init__.py +27 -0
  11. nl_code-0.5.0/src/nl_code/code_execution/models.py +152 -0
  12. nl_code-0.5.0/src/nl_code/code_execution/runner.py +918 -0
  13. nl_code-0.5.0/src/nl_code/code_execution/worker.py +627 -0
  14. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/code_validation.py +8 -2
  15. nl_code-0.5.0/src/nl_code/datasets/bigcodebench_lite_pro_dataset.py +96 -0
  16. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/datasets/bigcodebench_lite_pro_task.py +9 -31
  17. nl_code-0.5.0/src/nl_code/datasets/cache.py +163 -0
  18. nl_code-0.5.0/src/nl_code/datasets/cache_cli.py +75 -0
  19. nl_code-0.5.0/src/nl_code/datasets/catalog.py +21 -0
  20. nl_code-0.5.0/src/nl_code/datasets/classeval_dataset.py +97 -0
  21. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/datasets/classeval_task.py +33 -114
  22. nl_code-0.5.0/src/nl_code/datasets/dataset.py +252 -0
  23. nl_code-0.5.0/src/nl_code/datasets/gt_verification.py +24 -0
  24. nl_code-0.5.0/src/nl_code/datasets/humaneval_dataset.py +85 -0
  25. nl_code-0.5.0/src/nl_code/datasets/humaneval_pro_dataset.py +84 -0
  26. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/datasets/humaneval_pro_task.py +5 -31
  27. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/datasets/humaneval_task.py +7 -29
  28. nl_code-0.5.0/src/nl_code/datasets/mbpp_pro_dataset.py +84 -0
  29. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/datasets/mbpp_pro_task.py +5 -31
  30. nl_code-0.5.0/src/nl_code/test_cli.py +36 -0
  31. {nl_code-0.4.1 → nl_code-0.5.0}/tests/conftest.py +35 -8
  32. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_bigcodebench_lite_pro_dataset.py +35 -24
  33. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_bigcodebench_lite_pro_task.py +7 -5
  34. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_classeval_dataset.py +24 -20
  35. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_classeval_task.py +12 -5
  36. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_code_validation.py +10 -2
  37. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_dataset.py +17 -24
  38. nl_code-0.5.0/tests/test_dataset_cache_cli.py +44 -0
  39. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_dataset_explorer_compare.py +13 -5
  40. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_dataset_slice.py +2 -0
  41. nl_code-0.5.0/tests/test_execution_models.py +169 -0
  42. nl_code-0.5.0/tests/test_execution_runner.py +583 -0
  43. nl_code-0.5.0/tests/test_execution_worker.py +537 -0
  44. nl_code-0.5.0/tests/test_gt_verification.py +34 -0
  45. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_humaneval_dataset.py +31 -24
  46. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_humaneval_pro_dataset.py +11 -19
  47. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_humaneval_pro_task.py +7 -5
  48. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_humaneval_task.py +6 -5
  49. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_mbpp_pro_dataset.py +10 -20
  50. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_mbpp_pro_task.py +7 -5
  51. nl_code-0.5.0/tests/test_test_cli.py +34 -0
  52. {nl_code-0.4.1 → nl_code-0.5.0}/uv.lock +310 -2
  53. nl_code-0.4.1/.git +0 -1
  54. nl_code-0.4.1/PKG-INFO +0 -35
  55. nl_code-0.4.1/README.md +0 -11
  56. nl_code-0.4.1/src/nl_code/code_execution/__init__.py +0 -1
  57. nl_code-0.4.1/src/nl_code/code_execution/models.py +0 -35
  58. nl_code-0.4.1/src/nl_code/code_execution/runner.py +0 -161
  59. nl_code-0.4.1/src/nl_code/code_execution/worker.py +0 -440
  60. nl_code-0.4.1/src/nl_code/datasets/bigcodebench_lite_pro_dataset.py +0 -37
  61. nl_code-0.4.1/src/nl_code/datasets/classeval_dataset.py +0 -27
  62. nl_code-0.4.1/src/nl_code/datasets/dataset.py +0 -100
  63. nl_code-0.4.1/src/nl_code/datasets/humaneval_dataset.py +0 -27
  64. nl_code-0.4.1/src/nl_code/datasets/humaneval_pro_dataset.py +0 -29
  65. nl_code-0.4.1/src/nl_code/datasets/mbpp_pro_dataset.py +0 -29
  66. nl_code-0.4.1/tests/test_execution_models.py +0 -45
  67. nl_code-0.4.1/tests/test_execution_runner.py +0 -124
  68. nl_code-0.4.1/tests/test_execution_worker.py +0 -168
  69. {nl_code-0.4.1 → nl_code-0.5.0}/.python-version +0 -0
  70. {nl_code-0.4.1 → nl_code-0.5.0}/nbs/bigcodebench_lite_pro_validation.py +0 -0
  71. {nl_code-0.4.1 → nl_code-0.5.0}/nbs/classeval_ground_truth.py +0 -0
  72. {nl_code-0.4.1 → nl_code-0.5.0}/nbs/humaneval_description_quality.py +0 -0
  73. {nl_code-0.4.1 → nl_code-0.5.0}/nbs/humaneval_pro_validation.py +0 -0
  74. {nl_code-0.4.1 → nl_code-0.5.0}/nbs/mbpp_pro_validation.py +0 -0
  75. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/code_analysis.py +0 -0
  76. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/code_parsing.py +0 -0
  77. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/datasets/__init__.py +0 -0
  78. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/datasets/dataset_slice.py +0 -0
  79. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/datasets/pro_task_helpers.py +0 -0
  80. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/datasets/task.py +0 -0
  81. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/evaluation/__init__.py +0 -0
  82. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/evaluation/length.py +0 -0
  83. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/evaluation/overlap.py +0 -0
  84. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/evaluation/tokenizer.py +0 -0
  85. {nl_code-0.4.1 → nl_code-0.5.0}/src/nl_code/py.typed +0 -0
  86. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_code_analysis.py +0 -0
  87. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_code_parsing.py +0 -0
  88. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_length.py +0 -0
  89. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_overlap.py +0 -0
  90. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_task.py +0 -0
  91. {nl_code-0.4.1 → nl_code-0.5.0}/tests/test_tokenizer.py +0 -0
  92. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/.gitignore +0 -0
  93. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/README.md +0 -0
  94. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/backend/.env.example +0 -0
  95. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/backend/__init__.py +0 -0
  96. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/backend/data/.gitkeep +0 -0
  97. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/backend/main.py +0 -0
  98. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/backend/models/__init__.py +0 -0
  99. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/backend/models/dataset_explorer.py +0 -0
  100. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/backend/routers/__init__.py +0 -0
  101. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/backend/routers/datasets.py +0 -0
  102. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/backend/services/datasets.py +0 -0
  103. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/biome.json +0 -0
  104. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/index.html +0 -0
  105. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/package-lock.json +0 -0
  106. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/package.json +0 -0
  107. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/postcss.config.js +0 -0
  108. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/App.tsx +0 -0
  109. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/api/client.ts +0 -0
  110. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/api/datasets.ts +0 -0
  111. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/charts/Plot.tsx +0 -0
  112. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/code/PythonCodeBlock.tsx +0 -0
  113. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/detail/InspectorSections.tsx +0 -0
  114. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/layout/Layout.tsx +0 -0
  115. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/layout/Sidebar.tsx +0 -0
  116. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/ui/badge.tsx +0 -0
  117. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/ui/button.tsx +0 -0
  118. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/ui/card.tsx +0 -0
  119. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/ui/code-block.tsx +0 -0
  120. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/ui/input.tsx +0 -0
  121. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/ui/label.tsx +0 -0
  122. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/ui/page-skeletons.tsx +0 -0
  123. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/ui/page-status.tsx +0 -0
  124. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/ui/select.tsx +0 -0
  125. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/ui/separator.tsx +0 -0
  126. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/ui/skeleton.tsx +0 -0
  127. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/components/ui/slider.tsx +0 -0
  128. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/hooks/useTaskFilters.ts +0 -0
  129. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/index.css +0 -0
  130. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/lib/chartColors.ts +0 -0
  131. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/lib/metrics.ts +0 -0
  132. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/lib/utils.ts +0 -0
  133. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/main.tsx +0 -0
  134. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/pages/ComparePage.tsx +0 -0
  135. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/pages/HomeRedirect.tsx +0 -0
  136. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/pages/OverviewPage.tsx +0 -0
  137. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/pages/RawDetailPage.tsx +0 -0
  138. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/pages/TaskBrowserPage.tsx +0 -0
  139. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/pages/TaskDetailPage.tsx +0 -0
  140. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/plotly-cartesian.d.ts +0 -0
  141. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/src/types/datasetExplorer.ts +0 -0
  142. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/tailwind.config.js +0 -0
  143. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/tsconfig.json +0 -0
  144. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/tsconfig.node.json +0 -0
  145. {nl_code-0.4.1 → nl_code-0.5.0}/ui/dataset-explorer/frontend/vite.config.ts +0 -0
@@ -0,0 +1,27 @@
1
+ # Database
2
+ DR_LLM_DATABASE_URL=postgresql://localhost/nl_latents
3
+
4
+ # Optional API credentials
5
+ # OPENROUTER_API_KEY=
6
+ # GOOGLE_API_KEY=
7
+
8
+ # Code execution — worker limits (inside container/subprocess)
9
+ DR_DOCKER_WORKER_MAX_STDIN_BYTES=52428800
10
+ DR_DOCKER_WORKER_MAX_STDOUT_BYTES=1048576
11
+ DR_DOCKER_WORKER_CPU_SECONDS=20
12
+ DR_DOCKER_WORKER_MEMORY_BYTES=4294967296
13
+ DR_DOCKER_WORKER_FILE_BYTES=10485760
14
+ DR_DOCKER_WORKER_NPROC=256
15
+ DR_DOCKER_WORKER_SKIP_LIMITS=false
16
+
17
+ # Code execution — runner limits (host side)
18
+ NL_CODE_EVAL_WORKER_MAX_STDOUT_BYTES=1048576
19
+ NL_CODE_EVAL_WORKER_MAX_STDERR_BYTES=1048576
20
+
21
+ # Code execution — Docker container limits
22
+ DR_DOCKER_MEMORY=4g
23
+ DR_DOCKER_CPUS=1.0
24
+ DR_DOCKER_PIDS_LIMIT=256
25
+ DR_DOCKER_NOFILE=1024
26
+ DR_DOCKER_FSIZE_BYTES=10485760
27
+ DR_DOCKER_TMPFS_SIZE=64m
@@ -1,3 +1,4 @@
1
+ .env
1
2
  # Python
2
3
  __pycache__/
3
4
  *.pyc
@@ -11,8 +11,12 @@ uv run ruff format .
11
11
  uv run ruff check .
12
12
  uv run ty check
13
13
  uv run pytest
14
+ uv run nl-code-test docker
14
15
  ```
15
16
 
17
+ `uv run pytest` runs the default non-Docker suite. Run `uv run nl-code-test docker`
18
+ separately to execute the `@pytest.mark.docker` integration tests.
19
+
16
20
  ### Frontend (from ui/dataset-explorer/frontend/)
17
21
 
18
22
  ```bash
@@ -11,8 +11,12 @@ uv run ruff format .
11
11
  uv run ruff check .
12
12
  uv run ty check
13
13
  uv run pytest
14
+ uv run nl-code-test docker
14
15
  ```
15
16
 
17
+ `uv run pytest` runs the default non-Docker suite. Run `uv run nl-code-test docker`
18
+ separately to execute the `@pytest.mark.docker` integration tests.
19
+
16
20
  ### Frontend (from ui/dataset-explorer/frontend/)
17
21
 
18
22
  ```bash
nl_code-0.5.0/PKG-INFO ADDED
@@ -0,0 +1,132 @@
1
+ Metadata-Version: 2.4
2
+ Name: nl-code
3
+ Version: 0.5.0
4
+ Summary: Primitives for research into LLMs and code
5
+ Author-email: Danielle Rothermel <danielle.rothermel@gmail.com>
6
+ Requires-Python: >=3.12
7
+ Requires-Dist: datasets>=3.0.0
8
+ Requires-Dist: dr-docker==0.4.5
9
+ Requires-Dist: fastapi>=0.135.3
10
+ Requires-Dist: pydantic>=2.12.0
11
+ Requires-Dist: python-dotenv>=1.2.2
12
+ Requires-Dist: typer>=0.24.1
13
+ Requires-Dist: uvicorn[standard]>=0.44.0
14
+ Provides-Extra: bigcodebench
15
+ Requires-Dist: beautifulsoup4>=4.12; extra == 'bigcodebench'
16
+ Requires-Dist: gensim>=4.3; extra == 'bigcodebench'
17
+ Requires-Dist: holidays>=0.60; extra == 'bigcodebench'
18
+ Requires-Dist: matplotlib>=3.9; extra == 'bigcodebench'
19
+ Requires-Dist: nltk>=3.9; extra == 'bigcodebench'
20
+ Requires-Dist: numpy>=1.26; extra == 'bigcodebench'
21
+ Requires-Dist: openpyxl>=3.1; extra == 'bigcodebench'
22
+ Requires-Dist: pandas>=2.2; extra == 'bigcodebench'
23
+ Requires-Dist: pypdf2>=3.0; extra == 'bigcodebench'
24
+ Requires-Dist: python-dateutil>=2.9; extra == 'bigcodebench'
25
+ Requires-Dist: python-docx>=1.1; extra == 'bigcodebench'
26
+ Requires-Dist: pytz>=2024.1; extra == 'bigcodebench'
27
+ Requires-Dist: regex>=2024.4; extra == 'bigcodebench'
28
+ Requires-Dist: reportlab>=4.2; extra == 'bigcodebench'
29
+ Requires-Dist: scikit-learn>=1.5; extra == 'bigcodebench'
30
+ Requires-Dist: scipy>=1.14; extra == 'bigcodebench'
31
+ Requires-Dist: seaborn>=0.13; extra == 'bigcodebench'
32
+ Requires-Dist: statsmodels>=0.14; extra == 'bigcodebench'
33
+ Provides-Extra: docker
34
+ Requires-Dist: dr-docker>=0.4.5; extra == 'docker'
35
+ Description-Content-Type: text/markdown
36
+
37
+ # nl-code
38
+
39
+ Primitives for research into LLMs and code generation. Provides dataset loading, code execution (with Docker isolation), code analysis, and a dataset explorer UI.
40
+
41
+ ## Install
42
+
43
+ ```bash
44
+ uv add nl-code # core
45
+ uv add nl-code[docker] # + Docker execution via dr-docker
46
+ uv add nl-code[bigcodebench] # + scientific libs for BigCodeBench/ClassEval
47
+ ```
48
+
49
+ ## Code Execution
50
+
51
+ Execute generated code in isolated Docker containers.
52
+
53
+ Three execution modes covering all supported dataset test formats:
54
+
55
+ - **function_call** — call a named function with inputs, compare return values (HumanEval)
56
+ - **assertion** — exec code + assertion-based test code (HumanEval-Pro, MBPP-Pro, BigCodeBench Lite Pro)
57
+ - **unittest** — exec code + unittest.TestCase classes (ClassEval)
58
+
59
+ Batch variants (`batch_run_test_cases`, `batch_run_assertion_tests`, `batch_run_unittest_tests`) process many code samples in a single container with auto-chunking.
60
+
61
+ ### Build The Docker Image
62
+
63
+ Build the execution image from the repo root:
64
+
65
+ ```bash
66
+ docker build -t nl-code/code-eval-scientific:v1 -f docker/scientific.Dockerfile .
67
+ ```
68
+
69
+ This is the default runtime image used by the execution pipeline. The Dockerfile
70
+ installs both the `bigcodebench` dependency set and the pinned `dr-docker`
71
+ runtime dependency directly from `pyproject.toml`, so the image stays aligned
72
+ with the repo's declared execution requirements.
73
+
74
+ ### Run The Docker Test Tier
75
+
76
+ Docker-dependent tests are marked with `@pytest.mark.docker` and are excluded
77
+ from the default `pytest` run.
78
+
79
+ Run them explicitly with:
80
+
81
+ ```bash
82
+ uv run nl-code-test docker
83
+ ```
84
+
85
+ You can pass extra pytest arguments through after `docker`, for example:
86
+
87
+ ```bash
88
+ uv run nl-code-test docker -q tests/test_execution_runner.py
89
+ ```
90
+
91
+ ## Datasets
92
+
93
+ Loaders for HumanEval, HumanEval-Pro, MBPP-Pro, BigCodeBench Lite Pro, and ClassEval. Datasets are fetched from HuggingFace, parsed into `Task` objects, and cached locally. `DatasetSlice` supports filtering, seeded shuffling, and limit.
94
+
95
+ ## Dataset Explorer
96
+
97
+ A FastAPI + React app for browsing and comparing datasets. Run from `ui/dataset-explorer/`.
98
+
99
+ ## Headless validation runs
100
+
101
+ General dataset validation/debugging commands that import `matplotlib` should run headlessly with:
102
+
103
+ ```bash
104
+ MPLBACKEND=Agg uv run python ...
105
+ ```
106
+
107
+ ## Rebuild Dataset Caches
108
+
109
+ Run the Docker-backed cache rebuilds with:
110
+
111
+ ```bash
112
+ uv run python -m nl_code.datasets.cache_cli rebuild humaneval-plus
113
+ uv run python -m nl_code.datasets.cache_cli rebuild humaneval-pro
114
+ uv run python -m nl_code.datasets.cache_cli rebuild mbpp-pro
115
+ uv run python -m nl_code.datasets.cache_cli rebuild class-eval
116
+ uv run python -m nl_code.datasets.cache_cli rebuild bigcodebench-lite-pro
117
+ ```
118
+
119
+ `cache_cli rebuild` sets `MPLBACKEND=Agg` automatically.
120
+
121
+ Current observed results with the default execution image and env limits:
122
+
123
+ ```text
124
+ humaneval-plus: cached 163 tasks (163 raw, 1 flawed)
125
+ humaneval-pro: cached 164 tasks (164 raw, 0 flawed)
126
+ mbpp-pro: cached 375 tasks (375 raw, 3 flawed)
127
+ class-eval: cached 98 tasks (98 raw, 2 flawed)
128
+ bigcodebench-lite-pro: cached 54 tasks (54 raw, 3 flawed)
129
+ ```
130
+
131
+ The remaining flawed samples above are dataset-level failures, not Docker
132
+ runtime failures.
@@ -0,0 +1,96 @@
1
+ # nl-code
2
+
3
+ Primitives for research into LLMs and code generation. Provides dataset loading, code execution (with Docker isolation), code analysis, and a dataset explorer UI.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ uv add nl-code # core
9
+ uv add nl-code[docker] # + Docker execution via dr-docker
10
+ uv add nl-code[bigcodebench] # + scientific libs for BigCodeBench/ClassEval
11
+ ```
12
+
13
+ ## Code Execution
14
+
15
+ Execute generated code in isolated Docker containers.
16
+
17
+ Three execution modes covering all supported dataset test formats:
18
+
19
+ - **function_call** — call a named function with inputs, compare return values (HumanEval)
20
+ - **assertion** — exec code + assertion-based test code (HumanEval-Pro, MBPP-Pro, BigCodeBench Lite Pro)
21
+ - **unittest** — exec code + unittest.TestCase classes (ClassEval)
22
+
23
+ Batch variants (`batch_run_test_cases`, `batch_run_assertion_tests`, `batch_run_unittest_tests`) process many code samples in a single container with auto-chunking.
24
+
25
+ ### Build The Docker Image
26
+
27
+ Build the execution image from the repo root:
28
+
29
+ ```bash
30
+ docker build -t nl-code/code-eval-scientific:v1 -f docker/scientific.Dockerfile .
31
+ ```
32
+
33
+ This is the default runtime image used by the execution pipeline. The Dockerfile
34
+ installs both the `bigcodebench` dependency set and the pinned `dr-docker`
35
+ runtime dependency directly from `pyproject.toml`, so the image stays aligned
36
+ with the repo's declared execution requirements.
37
+
38
+ ### Run The Docker Test Tier
39
+
40
+ Docker-dependent tests are marked with `@pytest.mark.docker` and are excluded
41
+ from the default `pytest` run.
42
+
43
+ Run them explicitly with:
44
+
45
+ ```bash
46
+ uv run nl-code-test docker
47
+ ```
48
+
49
+ You can pass extra pytest arguments through after `docker`, for example:
50
+
51
+ ```bash
52
+ uv run nl-code-test docker -q tests/test_execution_runner.py
53
+ ```
54
+
55
+ ## Datasets
56
+
57
+ Loaders for HumanEval, HumanEval-Pro, MBPP-Pro, BigCodeBench Lite Pro, and ClassEval. Datasets are fetched from HuggingFace, parsed into `Task` objects, and cached locally. `DatasetSlice` supports filtering, seeded shuffling, and limit.
58
+
59
+ ## Dataset Explorer
60
+
61
+ A FastAPI + React app for browsing and comparing datasets. Run from `ui/dataset-explorer/`.
62
+
63
+ ## Headless validation runs
64
+
65
+ General dataset validation/debugging commands that import `matplotlib` should run headlessly with:
66
+
67
+ ```bash
68
+ MPLBACKEND=Agg uv run python ...
69
+ ```
70
+
71
+ ## Rebuild Dataset Caches
72
+
73
+ Run the Docker-backed cache rebuilds with:
74
+
75
+ ```bash
76
+ uv run python -m nl_code.datasets.cache_cli rebuild humaneval-plus
77
+ uv run python -m nl_code.datasets.cache_cli rebuild humaneval-pro
78
+ uv run python -m nl_code.datasets.cache_cli rebuild mbpp-pro
79
+ uv run python -m nl_code.datasets.cache_cli rebuild class-eval
80
+ uv run python -m nl_code.datasets.cache_cli rebuild bigcodebench-lite-pro
81
+ ```
82
+
83
+ `cache_cli rebuild` sets `MPLBACKEND=Agg` automatically.
84
+
85
+ Current observed results with the default execution image and env limits:
86
+
87
+ ```text
88
+ humaneval-plus: cached 163 tasks (163 raw, 1 flawed)
89
+ humaneval-pro: cached 164 tasks (164 raw, 0 flawed)
90
+ mbpp-pro: cached 375 tasks (375 raw, 3 flawed)
91
+ class-eval: cached 98 tasks (98 raw, 2 flawed)
92
+ bigcodebench-lite-pro: cached 54 tasks (54 raw, 3 flawed)
93
+ ```
94
+
95
+ The remaining flawed samples above are dataset-level failures, not Docker
96
+ runtime failures.
@@ -0,0 +1,67 @@
1
+ # Python code evaluation sandbox.
2
+ # Uses the scientific dependency set from pyproject.toml.
3
+ #
4
+ # Build:
5
+ # docker build -t nl-code/code-eval-scientific:v1 -f docker/scientific.Dockerfile .
6
+ FROM python:3.12-slim
7
+
8
+ LABEL org.opencontainers.image.title="nl-code/code-eval-scientific"
9
+ LABEL org.opencontainers.image.description="Python code evaluation sandbox (scientific)"
10
+ LABEL org.opencontainers.image.version="v1"
11
+
12
+ COPY pyproject.toml /tmp/nl-code/pyproject.toml
13
+
14
+ RUN python - <<'PY' > /tmp/bigcodebench-requirements.txt
15
+ from pathlib import Path
16
+ import tomllib
17
+
18
+ pyproject = tomllib.loads(Path("/tmp/nl-code/pyproject.toml").read_text())
19
+ requirements = pyproject["project"]["optional-dependencies"]["bigcodebench"]
20
+ print("\n".join(requirements))
21
+ PY
22
+
23
+ RUN python - <<'PY' > /tmp/dr-docker-requirements.txt
24
+ from pathlib import Path
25
+ import tomllib
26
+
27
+ pyproject = tomllib.loads(Path("/tmp/nl-code/pyproject.toml").read_text())
28
+ requirements = [
29
+ requirement
30
+ for requirement in pyproject["project"]["dependencies"]
31
+ if requirement.startswith("dr-docker")
32
+ ]
33
+ if len(requirements) != 1:
34
+ raise SystemExit(
35
+ f"expected exactly one dr-docker requirement, found {len(requirements)}"
36
+ )
37
+ print(requirements[0])
38
+ PY
39
+
40
+ RUN pip install --no-cache-dir \
41
+ -r /tmp/bigcodebench-requirements.txt \
42
+ -r /tmp/dr-docker-requirements.txt \
43
+ && rm -rf \
44
+ /root/.cache/pip \
45
+ /tmp/bigcodebench-requirements.txt \
46
+ /tmp/dr-docker-requirements.txt \
47
+ /tmp/nl-code
48
+
49
+ # Preload NLTK resources needed by ClassEval tasks so evaluation does not
50
+ # attempt network downloads inside the sandbox at runtime.
51
+ ENV NLTK_DATA=/usr/local/share/nltk_data
52
+ RUN python -m nltk.downloader -d "${NLTK_DATA}" \
53
+ averaged_perceptron_tagger \
54
+ averaged_perceptron_tagger_eng \
55
+ punkt \
56
+ punkt_tab \
57
+ wordnet \
58
+ omw-1.4
59
+
60
+ RUN useradd -m -s /bin/bash evaluser \
61
+ && mkdir -p /sandbox \
62
+ && chown evaluser:evaluser /sandbox
63
+
64
+ COPY --chown=evaluser:evaluser src/nl_code/code_execution/worker.py /sandbox/worker.py
65
+
66
+ USER evaluser
67
+ WORKDIR /tmp
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "nl-code"
3
- version = "0.4.1"
3
+ version = "0.5.0"
4
4
  description = "Primitives for research into LLMs and code"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -12,9 +12,15 @@ dependencies = [
12
12
  "datasets>=3.0.0",
13
13
  "fastapi>=0.135.3",
14
14
  "uvicorn[standard]>=0.44.0",
15
+ "typer>=0.24.1",
16
+ "python-dotenv>=1.2.2",
17
+ "dr-docker==0.4.5",
15
18
  ]
16
19
 
17
20
  [project.optional-dependencies]
21
+ docker = [
22
+ "dr-docker>=0.4.5",
23
+ ]
18
24
  bigcodebench = [
19
25
  "numpy>=1.26",
20
26
  "pandas>=2.2",
@@ -27,8 +33,18 @@ bigcodebench = [
27
33
  "python-dateutil>=2.9",
28
34
  "holidays>=0.60",
29
35
  "regex>=2024.4",
36
+ "beautifulsoup4>=4.12",
37
+ "python-docx>=1.1",
38
+ "openpyxl>=3.1",
39
+ "gensim>=4.3",
40
+ "nltk>=3.9",
41
+ "PyPDF2>=3.0",
42
+ "reportlab>=4.2",
30
43
  ]
31
44
 
45
+ [project.scripts]
46
+ nl-code-test = "nl_code.test_cli:app"
47
+
32
48
  [build-system]
33
49
  requires = ["hatchling"]
34
50
  build-backend = "hatchling.build"
@@ -41,6 +57,12 @@ dev = [
41
57
  "ty>=0.0.29",
42
58
  ]
43
59
 
60
+ [tool.pytest.ini_options]
61
+ markers = [
62
+ "docker: tests that require a running Docker daemon",
63
+ ]
64
+ addopts = "-m 'not docker'"
65
+
44
66
  [tool.ruff]
45
67
  include = ["src/**/*.py", "tests/**/*.py"]
46
68
 
@@ -1 +1,5 @@
1
1
  """Primitives for research into LLMs and code."""
2
+
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
@@ -0,0 +1,27 @@
1
+ """Code execution with Docker isolation."""
2
+
3
+ from nl_code.code_execution.models import (
4
+ DEFAULT_CODE_EVAL_IMAGE as DEFAULT_CODE_EVAL_IMAGE,
5
+ SCIENTIFIC_CODE_EVAL_IMAGE as SCIENTIFIC_CODE_EVAL_IMAGE,
6
+ AssertionBatchItem as AssertionBatchItem,
7
+ AssertionTestResult as AssertionTestResult,
8
+ CodeExecutionInfrastructureError as CodeExecutionInfrastructureError,
9
+ ExecutionResult as ExecutionResult,
10
+ FunctionCallBatchItem as FunctionCallBatchItem,
11
+ TestCase as TestCase,
12
+ TestCaseResult as TestCaseResult,
13
+ UnittestBatchItem as UnittestBatchItem,
14
+ UnittestResult as UnittestResult,
15
+ UnittestTestDetail as UnittestTestDetail,
16
+ )
17
+ from nl_code.code_execution.runner import (
18
+ EXEC_MODE_DOCKER as EXEC_MODE_DOCKER,
19
+ batch_run_assertion_tests as batch_run_assertion_tests,
20
+ batch_run_test_cases as batch_run_test_cases,
21
+ batch_run_unittest_tests as batch_run_unittest_tests,
22
+ check_compiles as check_compiles,
23
+ run_assertion_test as run_assertion_test,
24
+ run_function_batch as run_function_batch,
25
+ run_test_cases as run_test_cases,
26
+ run_unittest_test as run_unittest_test,
27
+ )
@@ -0,0 +1,152 @@
1
+ from typing import Any
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ DEFAULT_CODE_EVAL_IMAGE = "nl-code/code-eval-scientific:v1"
6
+ SCIENTIFIC_CODE_EVAL_IMAGE = DEFAULT_CODE_EVAL_IMAGE
7
+
8
+
9
class CodeExecutionInfrastructureError(RuntimeError):
    """Raised when the execution platform itself fails.

    This error means Docker/subprocess infrastructure could not run the code.
    It is NEVER raised for code-level failures (syntax errors, runtime
    exceptions, wrong answers) — those are returned in result objects.

    Callers should ``try/except CodeExecutionInfrastructureError`` to handle
    platform issues and inspect result objects for code-level failures.

    Instances survive ``pickle`` and ``copy`` round trips, so they can be
    propagated across process boundaries without losing their fields.

    Attributes:
        stage: Name of the pipeline stage that failed, as given by the caller.
        execution_mode: Execution mode in use when the failure occurred.
        detail: Human-readable description of the failure.
    """

    def __init__(
        self,
        *,
        stage: str,
        execution_mode: str,
        detail: str,
    ) -> None:
        """Store the failure context and build the exception message.

        Args:
            stage: Name of the pipeline stage that failed.
            execution_mode: Execution mode in use when the failure occurred.
            detail: Human-readable description of the failure.
        """
        self.stage = stage
        self.execution_mode = execution_mode
        self.detail = detail
        super().__init__(
            f"execution infrastructure failure "
            f"(stage={stage}, mode={execution_mode}): {detail}"
        )

    def __reduce__(self) -> tuple[object, ...]:
        """Describe how to rebuild this exception for ``pickle``/``copy``.

        The default ``BaseException.__reduce__`` replays ``cls(*self.args)``,
        i.e. it passes the formatted message positionally. That raises
        ``TypeError`` here because ``__init__`` only accepts keyword
        arguments, so the constructor call is rebuilt from the stored fields.

        Returns:
            A ``(callable, args, state)`` triple as expected by ``pickle``.
        """
        # Local import: keeps this block independent of the file's import list.
        from functools import partial

        # NOTE: a subclass that changes the __init__ signature must override
        # __reduce__ as well, because type(self) is called with these keywords.
        factory = partial(
            type(self),
            stage=self.stage,
            execution_mode=self.execution_mode,
            detail=self.detail,
        )
        # The state dict carries any extra attributes (for example notes added
        # via add_note) that __init__ alone would not restore.
        return factory, (), dict(vars(self))
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Function-call execution models
38
+ # ---------------------------------------------------------------------------
39
+
40
+
41
+ class ExecutionResult(BaseModel):
42
+ """Result of executing a function with a single input."""
43
+
44
+ input_value: Any  # the single input this execution was run with (required field)
45
+ return_value: Any | None = None  # defaults to None, so a genuine None return looks the same as "no value"; presumably disambiguated via `error` — confirm in runner
46
+ return_type: str | None = None  # presumably the type name of return_value — TODO confirm format against the worker
47
+ stdout: str = ""  # defaults to empty; presumably stdout captured during the call — confirm
48
+ stdout_truncated: bool = False  # presumably set when captured stdout was cut at a size cap — confirm against the worker
49
+ error: str | None = None  # None by default; presumably a runtime error message — confirm
50
+ compile_success: bool | None = None  # tri-state: the None default is distinct from True/False; presumably "not checked" — confirm
51
+ compile_error: str | None = None  # None by default; presumably populated when compile_success is False — confirm
52
+
53
+
54
+ class TestCase(BaseModel):
55
+ """A single test case: input and expected output."""
56
+
57
+ __test__ = False  # opt out of pytest collection: the class name matches pytest's default "Test*" class pattern
58
+
59
+ input_value: Any  # input for the case; Any places no constraint on its type
60
+ expected_output: Any  # output the case expects; Any places no constraint on its type
61
+
62
+
63
+ class TestCaseResult(BaseModel):
64
+ """Result of comparing execution output to expected output."""
65
+
66
+ __test__ = False  # opt out of pytest collection: the class name matches pytest's default "Test*" class pattern
67
+
68
+ input_value: Any  # required; presumably copied from the TestCase that was run — confirm
69
+ expected_output: Any  # required; presumably copied from the TestCase that was run — confirm
70
+ actual_output: Any | None = None  # None by default; a genuine None output is not distinguishable from "no output" on this field alone
71
+ passed: bool = False  # defaults to failure, so a result must be marked passed explicitly
72
+ error: str | None = None  # None by default; presumably a runtime error message — confirm
73
+ compile_success: bool | None = None  # tri-state: the None default is distinct from True/False; presumably "not checked" — confirm
74
+ compile_error: str | None = None  # None by default; presumably populated when compile_success is False — confirm
75
+
76
+
77
+ # ---------------------------------------------------------------------------
78
+ # Assertion execution models (Pro datasets)
79
+ # ---------------------------------------------------------------------------
80
+
81
+
82
+ class AssertionTestResult(BaseModel):
83
+ """Result of running code against assertion-based test code."""
84
+
85
+ __test__ = False  # pytest's default "Test*" prefix rule would not match this name; presumably kept for consistency with the other models — confirm
86
+
87
+ passed: bool  # required (no default), unlike TestCaseResult.passed
88
+ error: str | None = None  # None by default; presumably the assertion/runtime failure message — confirm
89
+ stdout: str = ""  # defaults to empty; presumably stdout captured during the run — confirm
90
+ compile_success: bool | None = None  # tri-state: the None default is distinct from True/False; presumably "not checked" — confirm
91
+ compile_error: str | None = None  # None by default; presumably populated when compile_success is False — confirm
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # Unittest execution models (ClassEval)
96
+ # ---------------------------------------------------------------------------
97
+
98
+
99
+ class UnittestTestDetail(BaseModel):
100
+ """Result of running a single unittest.TestCase class."""
101
+
102
+ __test__ = False  # pytest's default "Test*" prefix rule would not match this name; presumably kept for consistency with the other models — confirm
103
+
104
+ test_class_name: str  # name of the unittest.TestCase class this detail describes
105
+ tests_run: int  # required count; NOTE(review): no validator ties it to passed + failed + errored (+ skipped) — confirm the invariant in the worker
106
+ tests_passed: int  # required count
107
+ tests_failed: int  # required count
108
+ tests_errored: int  # required count
109
+ tests_skipped: int = 0  # the only count with a default
110
+ failures: list[str] = Field(default_factory=list)  # default_factory gives each instance its own list; presumably one message per failed test — confirm
111
+ errors: list[str] = Field(default_factory=list)  # default_factory gives each instance its own list; presumably one message per errored test — confirm
112
+ passed: bool  # required; set by the producer, not derived from the counts in this model
113
+
114
+
115
+ class UnittestResult(BaseModel):
116
+ """Aggregated result across all unittest test classes."""
117
+
118
+ all_passed: bool  # required; set by the producer, not derived from per_test_class in this model
119
+ total_tests_run: int  # required total; presumably the sum over per_test_class — confirm in runner
120
+ total_tests_passed: int  # required total
121
+ total_tests_failed: int  # required total
122
+ total_tests_errored: int  # required total; NOTE(review): there is no total for skipped tests although UnittestTestDetail tracks tests_skipped
123
+ per_test_class: list[UnittestTestDetail]  # required; one detail entry per unittest.TestCase class
124
+ error: str | None = None  # None by default; presumably a run-level failure not tied to one test class — confirm
125
+
126
+
127
+ # ---------------------------------------------------------------------------
128
+ # Batch item models
129
+ # ---------------------------------------------------------------------------
130
+
131
+
132
+ class FunctionCallBatchItem(BaseModel):
133
+ """A single item for batch_run_test_cases."""
134
+
135
+ code: str  # source code of the sample to evaluate
136
+ function_name: str  # presumably the name of the function in `code` to call for each test case — confirm in runner
137
+ test_cases: list[TestCase]  # input/expected-output pairs to run against this sample
138
+
139
+
140
+ class AssertionBatchItem(BaseModel):
141
+ """A single item for batch_run_assertion_tests."""
142
+
143
+ code: str  # source code of the sample to evaluate
144
+ test_code: str  # assertion-based test code; presumably executed after `code` — confirm in worker
145
+
146
+
147
+ class UnittestBatchItem(BaseModel):
148
+ """A single item for batch_run_unittest_tests."""
149
+
150
+ code: str  # source code of the sample to evaluate
151
+ test_code: str  # test source; presumably defines the unittest.TestCase classes named below — confirm in worker
152
+ test_class_names: list[str]  # names of the test classes to run; presumably looked up in `test_code` — confirm