wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (237) hide show
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.2.dist-info/METADATA +67 -0
  215. wisent-0.5.2.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,107 @@
1
+ # coding/output_sanitizer/utils.py
2
+ from __future__ import annotations
3
+ import re
4
+ from textwrap import dedent
5
+
6
# Matches a Markdown fence: ```<lang>, optional whitespace, a newline, then the
# shortest body that runs up to the next ``` (or to the end of the text when the
# fence is never closed). DOTALL lets the body span several lines.
_FENCE_RE = re.compile(
    r"```(?P<lang>[a-zA-Z0-9_+-]*)\s*\n(?P<code>.*?)(?:```|$)",
    re.DOTALL,
)


def extract_code_block(raw: str, prefer_langs=("python","py","cpp","c++","java")) -> str:
    """
    Pick the most promising fenced code block out of a piece of text.

    Every fence is ranked by the pair (language tag is in `prefer_langs`,
    length of the body); the highest-ranked body is returned stripped. On a
    tie the earliest fence wins. When the text has no fence at all, the text
    itself is returned after `strip_triple_quotes`.

    Args:
        raw:
            Text that may contain Markdown-style fenced code blocks.
        prefer_langs:
            Fence language tags (compared against the lower-cased tag) that
            outrank every other tag.

    Returns:
        The stripped body of the chosen fence, or the cleaned-up input when
        no fence is present.

    Examples:
        >>> extract_code_block("Here is some code:\\n```python\\ndef foo(): pass\\n```")
        'def foo(): pass'
        >>> extract_code_block("No code blocks here.")
        'No code blocks here.'
        >>> extract_code_block("Multiple:\\n```java\\nclass A {}\\n```\\n```python\\ndef f(): pass\\n```")
        'def f(): pass'
    """
    chosen_body = None
    chosen_rank = None
    for fence in _FENCE_RE.finditer(raw):
        body = fence.group("code")
        tag = (fence.group("lang") or "").lower()
        rank = (1 if tag in prefer_langs else 0, len(body))
        # Strict ">" keeps the first fence on ties.
        if chosen_rank is None or rank > chosen_rank:
            chosen_body, chosen_rank = body, rank

    if chosen_body is None:
        return strip_triple_quotes(raw)
    return chosen_body.strip()
40
+
41
def strip_triple_quotes(s: str) -> str:
    """
    Remove one pair of wrapping triple quotes from a string, if present.

    The input is stripped of surrounding whitespace first. It counts as
    wrapped only when it starts and ends with the same triple-quote delimiter
    and is long enough for the two delimiters not to overlap, so short inputs
    such as four quote characters are returned unchanged instead of being
    collapsed to an empty string.

    Args:
        s:
            The input string.

    Returns:
        The stripped content between the delimiters when the string was
        wrapped; otherwise the whitespace-stripped input.

    Examples:
        >>> strip_triple_quotes('\"\"\"def foo(): pass\"\"\"')
        'def foo(): pass'
        >>> strip_triple_quotes("'''def foo(): pass'''")
        'def foo(): pass'
        >>> strip_triple_quotes('def foo(): pass')
        'def foo(): pass'
    """
    s = s.strip()
    for delim in ('"""', "'''"):
        # Opening and closing delimiters must be distinct occurrences.
        wrapped = len(s) >= 2 * len(delim) and s.startswith(delim) and s.endswith(delim)
        if wrapped:
            return s[len(delim):-len(delim)].strip()
    return s
66
+
67
def normalize_whitespace(code: str) -> str:
    """
    Bring a code snippet into a canonical whitespace form.

    CRLF and lone CR line endings become LF, the common leading indentation
    is removed with `textwrap.dedent`, and surrounding whitespace is stripped.

    arguments:
        code:
            The input code string.

    returns:
        The normalized code string.

    examples:
        >>> normalize_whitespace("def foo():\\r\\n    pass\\r")
        'def foo():\\n    pass'
    """
    # Collapse CRLF first so the remaining CRs are genuinely lone ones.
    lf_only = "\n".join(code.replace("\r\n", "\n").split("\r"))
    return dedent(lf_only).strip()
87
+
88
def maybe_black(code: str) -> str:
    """
    Format Python source with Black on a best-effort basis.

    arguments:
        code:
            The input Python code string.

    returns:
        Black's output when Black can be imported and formatting succeeds;
        the input unchanged in every other case (Black not installed,
        formatting raised, ...).

    examples:
        >>> maybe_black("x = 1\\n")
        'x = 1\\n'
    """
    result = code
    try:
        import black as _black

        result = _black.format_str(code, mode=_black.FileMode())
    except Exception:
        # Deliberate best-effort: any failure falls back to the input.
        pass
    return result
@@ -0,0 +1,18 @@
1
+ # coding/providers/__init__.py
2
+ from __future__ import annotations
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Iterable, Protocol, Literal
5
+
6
+ Language = Literal["python", "cpp", "java"]
7
+
8
@dataclass(frozen=True)
class CodingTask:
    """
    Immutable, normalized coding task: a language plus the harness files to run.

    attributes:
        language:
            The programming language of the task.
        files:
            File name -> file content, e.g.
            {"solution.py": "...", "tests.py": "..."} or the C++/Java
            equivalents.
        options:
            Extra execution options, e.g. {"cxx_std": "c++20"} or
            {"java_main": "MainTest"}.
    """

    language: Language
    files: Dict[str, str]
    options: Dict[str, object]
14
+
15
class Provider(Protocol):
    """
    Structural interface of a dataset provider.

    A provider exposes a `name` and yields CodingTask items (codegen or
    self-repair compatible) for a requested split.
    """

    name: str

    def iter_tasks(self, split: str = "test") -> Iterable[CodingTask]:
        """Yield the tasks of `split`."""
        ...
File without changes
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import Iterable, Protocol, Literal
4
+
5
+ Language = Literal["python", "cpp", "java"]
6
+
7
@dataclass(frozen=True)
class CodingTask:
    """
    Immutable, normalized coding task: a language plus the harness files to run.

    attributes:
        language:
            The programming language of the task.
        files:
            File name -> file content. For Python tasks this is e.g.
            {"solution.py": "...", "tests.py": "..."}; C++/Java tasks carry
            the equivalent source and test files.
        options:
            Additional options that execution may need, e.g.
            {"cxx_std": "c++20"} for C++ or {"java_main": "MainTest"} for
            Java.
    """

    language: Language
    files: dict[str, str]
    options: dict[str, object]
27
+
28
class Provider(Protocol):
    """
    Structural interface of a dataset provider.

    A provider exposes a `name` and yields CodingTask items (codegen or
    self-repair compatible) for a requested split.
    """

    name: str

    def iter_tasks(self, split: str = "test") -> Iterable[CodingTask]:
        """Yield the tasks of `split`."""
        ...
@@ -0,0 +1,53 @@
1
+ # coding/providers/livecodebench/provider.py
2
+ from __future__ import annotations
3
+ from typing import Iterable
4
+ from ..core.atoms import CodingTask, Language
5
+
6
class LiveCodeBenchProvider:
    """
    Sketch adapter for LiveCodeBench (code_generation_lite).

    This is a stub: instead of loading the dataset it yields one toy task for
    the configured language so that examples can run end to end.
    Note: the HF dataset card states the data is also used for self-repair
    with test case feedback.
    """

    name = "livecodebench"

    def __init__(self, language: Language = "python"):
        self.language = language

    def iter_tasks(self, split: str = "test") -> Iterable[CodingTask]:
        """
        Yield a single toy task (a deliberately buggy `add`) for `self.language`.

        `split` is accepted for interface compatibility and is not used by
        this stub. Any language other than "python" or "cpp" gets the Java
        task.
        """
        # Placeholder: a real implementation would load the HF dataset
        # (https://huggingface.co/datasets/livecodebench/code_generation_lite)
        # and map each row to {files} + options per language.
        selected = self.language
        if selected == "python":
            task_language = "python"
            task_files = {
                "solution.py": "def add(a,b): return a - b # BUG",
                "tests.py": "from solution import add\n"
                            "def test_ok(): assert add(1,2)==3\n"
                            "def test_neg(): assert add(-5,2)==-3\n",
            }
            task_options = {}
        elif selected == "cpp":
            task_language = "cpp"
            task_files = {
                "solution.cpp": "int add(int a,int b){return a-b;}",
                "test_main.cpp": "#include <cassert>\nint add(int,int);\nint main(){assert(add(1,2)==3);assert(add(-5,2)==-3);return 0;}",
            }
            task_options = {"cxx_std": "c++17"}
        else:  # java
            task_language = "java"
            task_files = {
                "Solution.java": "public class Solution{public static int add(int a,int b){return a-b;}}",
                "MainTest.java": "public class MainTest{public static void main(String[]a){"
                                 "if(Solution.add(1,2)!=3)throw new RuntimeException(\"f1\");"
                                 "if(Solution.add(-5,2)!=-3)throw new RuntimeException(\"f2\");}}",
            }
            task_options = {"java_main": "MainTest"}

        yield CodingTask(language=task_language, files=task_files, options=task_options)
File without changes
File without changes
@@ -0,0 +1,105 @@
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import Protocol, runtime_checkable
4
+
5
+ __all__ = ["Job", "Result", "LanguageRecipe", "SandboxExecutor"]
6
+
7
@dataclass(frozen=True)
class Job:
    """
    Immutable recipe for building and running one submission in a sandbox.

    attributes:
        language:
            Programming language, e.g. "python", "cpp", "java".
        compile_argv:
            argv of the compile step
            (e.g. ["g++", "-o", "program", "solution.cpp"]), or None when
            there is no compile step.
        run_argv:
            argv that runs the code
            (e.g. ["./program"] or ["python3", "solution.py"]).
        cpu_limit_s:
            CPU time limit in seconds (default 3).
        wall_timeout_s:
            Wall clock timeout in seconds (default 8).
        mem_limit_mb:
            Memory limit in megabytes (default 4096).
        fsize_mb:
            Max file size in megabytes (default 16).
        nproc:
            Max number of processes/threads (default 128).
        nofile:
            Max number of open files (default 512).

    example:
        >>> job = Job(
        ...     language="python",
        ...     compile_argv=None,
        ...     run_argv=["python3", "solution.py"],
        ... )
        >>> job.cpu_limit_s, job.wall_timeout_s
        (3, 8)
    """

    # What to run.
    language: str
    compile_argv: list[str] | None
    run_argv: list[str]

    # Resource limits.
    cpu_limit_s: int = 3
    wall_timeout_s: int = 8
    mem_limit_mb: int = 4096
    fsize_mb: int = 16
    nproc: int = 128
    nofile: int = 512
54
+
55
@dataclass(frozen=True)
class Result:
    """
    Immutable outcome of running a Job inside a sandbox.

    attributes:
        status:
            One of "ok", "compile_error", "runtime_error", "timeout".
        exit_code:
            Exit code reported for the program (or compiler).
        stdout:
            Captured standard output.
        stderr:
            Captured standard error.
        elapsed:
            Wall clock time elapsed in seconds (float).

    example:
        >>> res = Result(
        ...     status="ok",
        ...     exit_code=0,
        ...     stdout="Hello, world!",
        ...     stderr="",
        ...     elapsed=1.23,
        ... )
        >>> res.status
        'ok'
    """

    status: str
    exit_code: int
    stdout: str
    stderr: str
    elapsed: float
86
+
87
@runtime_checkable
class LanguageRecipe(Protocol):
    """
    Structural interface of an object that builds the Job for one language.

    attributes:
        language:
            The programming language this recipe supports,
            e.g. "python", "cpp", "java".
    """

    language: str

    def make_job(self, **options) -> Job:
        """Build the Job for this language from keyword options."""
        ...
98
+
99
+
100
@runtime_checkable
class SandboxExecutor(Protocol):
    """
    Structural interface of a sandbox backend.

    An executor takes the job's files (name -> content) and a Job and returns
    the Result of executing it inside a sandbox.
    """

    def run(self, files: dict[str, str], job: Job) -> Result:
        """Execute `job` against `files` and report the outcome."""
        ...
@@ -0,0 +1,118 @@
1
+ from __future__ import annotations
2
+ import json, os, subprocess, tempfile
3
+ from typing import TYPE_CHECKING
4
+ from wisent.benchmarks.coding.safe_docker.core.atoms import Result, SandboxExecutor
5
+
6
+ if TYPE_CHECKING:
7
+ from wisent.benchmarks.coding.safe_docker.core.atoms import Job
8
+
9
+ __all__ = ["DockerSandboxExecutor"]
10
+
11
# Image used when the executor is not given one explicitly.
DEFAULT_IMAGE = "coding/sandbox:polyglot-1.0"

# Hardening options passed to every `docker run`.
SAFE_FLAGS = [
    "--rm",                              # remove the container on exit
    "--network=none",                    # no network access
    "--pids-limit=256",                  # cap the number of processes
    "--read-only",                       # read-only root filesystem
    "--cap-drop=ALL",                    # drop all Linux capabilities
    "--security-opt=no-new-privileges",  # block privilege escalation
]

# Writable scratch space, since the root filesystem is read-only:
# /tmp is 128 MiB and /work is 256 MiB, both allowing execution.
TMPFS_FLAGS = [
    "--tmpfs",
    "/tmp:exec,mode=1777,size=134217728",
    "--tmpfs",
    "/work:exec,mode=1777,size=268435456",
]
25
+
26
+
27
class DockerSandboxExecutor(SandboxExecutor):
    """
    Executes a Job inside a Docker container, given a read-only job dir of files.

    attributes:
        image:
            Docker image to run.
        runtime:
            Optional container runtime name passed to `docker run --runtime`;
            None (or empty) uses Docker's default.
    """

    def __init__(self, image: str = DEFAULT_IMAGE, runtime: str | None = None):
        self.image = image
        self.runtime = runtime

    def run(self, files: dict[str, str], job: Job) -> Result:
        """
        Runs a Job inside a Docker container, given a read-only job dir of files.

        The files and a `job.json` describing the Job are written to a
        temporary directory that is bind-mounted read-only at `/job`. The
        container's stdout is expected to be a single JSON object with the
        keys `status`, `exit_code`, `stdout`, `stderr` and `elapsed`.

        arguments:
            files:
                A mapping of filename to file content, representing the job
                directory. Names may contain sub-directories.
            job:
                The Job to execute.

        exceptions:
            `docker` is invoked with `check=False`, so a non-zero exit status
            does not raise; it is reported through the returned Result.
            An OSError (e.g. FileNotFoundError) propagates when the `docker`
            executable cannot be started or a file cannot be written.

        returns:
            A Result built from the container's JSON output, or a
            "runtime_error" Result carrying docker's raw stdout/stderr when
            the output is not a JSON object.

        example (python add function)
            >>> job = Job(
            ...     language="python",
            ...     compile_argv=None,
            ...     run_argv=["python3", "tests.py"],
            ...     cpu_limit_s=2,
            ...     wall_timeout_s=5,
            ...     mem_limit_mb=256,
            ... )
            >>> files = {
            ...     "solution.py": "def add(a,b): return a + b",
            ...     "tests.py": "from solution import add\\nassert add(1,2)==3",
            ... }
            >>> res = DockerSandboxExecutor().run(files, job)  # doctest: +SKIP
        """
        with tempfile.TemporaryDirectory() as tmp:
            job_dir = os.path.join(tmp, "job")
            os.makedirs(job_dir, exist_ok=True)
            for name, content in files.items():
                target = os.path.join(job_dir, name)
                # Create parent directories so nested names such as
                # "pkg/mod.py" do not fail on open().
                os.makedirs(os.path.dirname(target), exist_ok=True)
                with open(target, "w", encoding="utf-8") as f:
                    f.write(content)
            with open(os.path.join(job_dir, "job.json"), "w", encoding="utf-8") as f:
                json.dump({
                    "language": job.language,
                    "compile": {"argv": job.compile_argv} if job.compile_argv else None,
                    "run": {"argv": job.run_argv},
                    "cpu_limit_s": job.cpu_limit_s,
                    "wall_timeout_s": job.wall_timeout_s,
                    "mem_limit_mb": job.mem_limit_mb,
                    "fsize_mb": job.fsize_mb,
                    "nproc": job.nproc,
                    "nofile": job.nofile,
                }, f)

            # `--runtime` is an option of `docker run`, so it must follow the
            # sub-command; placed before it, docker rejects the command line.
            cmd = ["docker", "run"]
            if self.runtime:
                cmd += ["--runtime", self.runtime]
            cmd += ["-i", *SAFE_FLAGS, *TMPFS_FLAGS, "-v", f"{job_dir}:/job:ro", self.image]
            p = subprocess.run(cmd, check=False, capture_output=True, text=True)

        out = (p.stdout or "").strip()
        try:
            payload = json.loads(out)
        except json.JSONDecodeError:
            payload = None
        # Valid JSON that is not an object (e.g. "[]") is also unusable.
        if not isinstance(payload, dict):
            return Result(
                status="runtime_error",
                exit_code=p.returncode,
                stdout=p.stdout or "",
                stderr=p.stderr or "Failed to parse executor output as JSON.",
                elapsed=0.0,
            )
        return Result(
            status=payload.get("status","runtime_error"),
            exit_code=int(payload.get("exit_code", p.returncode)),
            stdout=payload.get("stdout",""),
            stderr=payload.get("stderr",""),
            elapsed=float(payload.get("elapsed",0.0)),
        )
@@ -0,0 +1,123 @@
1
+ from __future__ import annotations
2
+ import json, os, shutil, subprocess, sys, time, signal, resource
3
+
4
+ from wisent.benchmarks.coding.safe_docker.core.atoms import Job
5
+
6
+ JOB_FILE = "/job/job.json"
7
+ WORKDIR = "/work"
8
+
9
def set_limits(job: dict) -> None:
    """
    Set resource limits for the sandboxed process and start a new session.

    Intended to run in the child between fork and exec (it is used as a
    `preexec_fn`). Soft and hard limits are set to the same value. A requested
    value above the limit the process already has is clamped to that limit,
    because raising a hard limit needs privileges the sandbox may not have
    and would make `setrlimit` raise.

    attributes:
        job:
            The job description parsed from job.json (a mapping, not a Job
            instance). Keys read: "cpu_limit_s", "mem_limit_mb", "fsize_mb",
            "nproc", "nofile".
    """
    def _apply(kind: int, wanted: int) -> None:
        _, hard = resource.getrlimit(kind)
        if hard != resource.RLIM_INFINITY:
            wanted = min(wanted, hard)
        resource.setrlimit(kind, (wanted, wanted))

    mib = 1024 * 1024
    _apply(resource.RLIMIT_CPU, job["cpu_limit_s"])
    _apply(resource.RLIMIT_AS, job["mem_limit_mb"] * mib)
    _apply(resource.RLIMIT_FSIZE, job["fsize_mb"] * mib)
    _apply(resource.RLIMIT_NPROC, job["nproc"])
    _apply(resource.RLIMIT_NOFILE, job["nofile"])
    _apply(resource.RLIMIT_CORE, 0)  # no core dumps
    # Own session/process group, so the whole group can be killed on timeout.
    os.setsid()
27
+
28
def run(argv: list[str], job: dict) -> tuple[int,str,str,float,str]:
    """
    Run a command in a subprocess with resource limits.

    The command runs in WORKDIR with `set_limits(job)` applied in the child.
    On a wall-clock timeout the child's process group is killed and the child
    is reaped, so no zombie process or open pipe is left behind.

    attributes:
        argv:
            Command and arguments to run as a list of strings.
        job:
            The job description parsed from job.json (a mapping). This
            function reads "wall_timeout_s"; `set_limits` reads the limits.

    returns:
        A tuple containing:
        - exit code (int): the child's return code, 124 on timeout, 127 when
          the executable is missing, 1 on any other launch failure
        - standard output (str)
        - standard error (str)
        - elapsed time in seconds (float)
        - status (str): "ok", "nonzero", "timeout", "missing", or "error"

    example:
        >>> code, out, err, elapsed, status = run(["python3", "solution.py"], job)  # doctest: +SKIP
    """
    # Monotonic clock: elapsed time must not jump with system clock changes.
    start = time.monotonic()
    try:
        p = subprocess.Popen(argv, cwd=WORKDIR, text=True,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             preexec_fn=lambda: set_limits(job))
        try:
            out, err = p.communicate(timeout=job["wall_timeout_s"])
        except subprocess.TimeoutExpired:
            try:
                # set_limits() made the child a group leader, so its pid is
                # also the process group id.
                os.killpg(p.pid, signal.SIGKILL)
            except OSError:
                pass  # group already gone
            elapsed = time.monotonic() - start
            try:
                # Reap the child and close its pipes. The wait is bounded in
                # case a descendant left the group and still holds a pipe.
                p.communicate(timeout=1)
            except subprocess.TimeoutExpired:
                pass
            return 124, "", f"Time limit exceeded ({job['wall_timeout_s']}s)\n", elapsed, "timeout"
        status = "ok" if p.returncode == 0 else "nonzero"
        return p.returncode, out, err, time.monotonic()-start, status
    except FileNotFoundError as e:
        return 127, "", f"{e}\n", time.monotonic()-start, "missing"
    except Exception as e:
        # Boundary of the entrypoint: report any other failure as a result.
        return 1, "", f"{e}\n", time.monotonic()-start, "error"
74
+
75
def copy_job():
    """
    Mirror the tree under /job into WORKDIR.

    Directories are created as needed and each file is copied with
    `shutil.copy2`.
    """
    os.makedirs(WORKDIR, exist_ok=True)
    for src_dir, _subdirs, names in os.walk("/job"):
        relative = os.path.relpath(src_dir, "/job")
        if relative == ".":
            dest_dir = os.path.join(WORKDIR, "")
        else:
            dest_dir = os.path.join(WORKDIR, relative)
        os.makedirs(dest_dir, exist_ok=True)
        for name in names:
            shutil.copy2(os.path.join(src_dir, name), os.path.join(dest_dir, name))
86
+
87
def main():
    """
    Main function to execute the job defined in /job/job.json.

    Steps: load the job description, copy the job files into the work
    directory, syntax-check `solution.py` for Python jobs, run the optional
    compile step, then run the program. The outcome is printed to stdout as a
    single JSON object with the keys `status`, `stdout`, `stderr`, `elapsed`
    and `exit_code`.

    returns:
        Exit code 0 once a JSON result has been printed (including
        compile/runtime failures of the job), 2 if job file is missing.
    """
    if not os.path.exists(JOB_FILE):
        print("Missing /job/job.json", file=sys.stderr)
        return 2
    with open(JOB_FILE, "r", encoding="utf-8") as f:
        job = json.load(f)

    copy_job()

    # optional quick syntax check for Python
    if job["language"] == "python":
        code, _, err, _, _ = run([sys.executable, "-m", "py_compile", "solution.py"], job)
        # Decide on the exit status, not on stderr: warnings such as
        # SyntaxWarning are printed to stderr for code that still compiles.
        if code != 0:
            print(json.dumps({"status":"compile_error","stdout":"","stderr":err,"elapsed":0.0,"exit_code":1}))
            return 0

    if job.get("compile"):
        code, out, err, el, _ = run(job["compile"]["argv"], job)
        if code != 0:
            print(json.dumps({"status":"compile_error","stdout":out,"stderr":err,"elapsed":el,"exit_code":code}))
            return 0

    code, out, err, el, status = run(job["run"]["argv"], job)
    payload = {
        "status": "ok" if code == 0 else ("timeout" if status == "timeout" else "runtime_error"),
        "stdout": out, "stderr": err, "elapsed": el, "exit_code": code
    }
    print(json.dumps(payload))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,60 @@
1
+ from __future__ import annotations
2
+ from typing import Dict
3
+ from wisent.benchmarks.coding.safe_docker.core.atoms import Job, LanguageRecipe
4
+
5
class PythonRecipe(LanguageRecipe):
    """
    Recipe for running Python code in a sandboxed environment.

    There is no compile step; the job runs pytest on `tests.py`.
    """

    language = "python"

    def make_job(self, **options) -> Job:
        """
        Build the pytest Job.

        Recognized options: `cpu_limit_s` (default 3), `time_limit_s`
        (wall clock timeout, default 8) and `mem_limit_mb` (default 768).
        """
        pytest_argv = ["pytest", "-q", "--maxfail=1", "--tb=short", "-rA", "tests.py"]
        limits = {
            "cpu_limit_s": options.get("cpu_limit_s", 3),
            "wall_timeout_s": options.get("time_limit_s", 8),
            "mem_limit_mb": options.get("mem_limit_mb", 768),
        }
        return Job(language="python", compile_argv=None, run_argv=pytest_argv, **limits)
16
+
17
class CppRecipe(LanguageRecipe):
    """
    Recipe for compiling and running C++ code in a sandboxed environment.

    `solution.cpp` and `test_main.cpp` are compiled with g++ into `program`,
    which is then executed. Both steps go through `bash -lc`.
    """

    language = "cpp"

    def make_job(self, **options) -> Job:
        """
        Build the compile-and-run Job.

        Recognized options: `cxx_std` (default "c++17"), `cpu_limit_s`
        (default 3), `time_limit_s` (wall clock timeout, default 8) and
        `mem_limit_mb` (default 768).
        """
        import shlex

        std = options.get("cxx_std", "c++17")
        # The value is spliced into a shell command line, so quote it. Usual
        # values such as "c++17" or "gnu++20" are left unchanged by quoting.
        std_arg = shlex.quote(str(std))
        compile_cmd = ["bash","-lc", f"g++ -std={std_arg} -O2 -pipe -o program solution.cpp test_main.cpp"]
        run_cmd = ["bash","-lc","./program"]
        return Job(language="cpp", compile_argv=compile_cmd, run_argv=run_cmd,
                   cpu_limit_s=options.get("cpu_limit_s",3),
                   wall_timeout_s=options.get("time_limit_s",8),
                   mem_limit_mb=options.get("mem_limit_mb",768))
27
+
28
class JavaRecipe:
    """
    Recipe for compiling and running Java code in a sandboxed environment.

    All `*.java` files are compiled with javac, then the main class is run
    with a set of JVM options. Both steps go through `bash -lc`.
    """

    language = "java"

    def make_job(self, **options) -> Job:
        """
        Build the compile-and-run Job.

        Recognized options: `java_main` (main class, default "MainTest"),
        `java_opts` (JVM flags as one space-separated string), `cpu_limit_s`
        (default 3), `time_limit_s` (wall clock timeout, default 8) and
        `mem_limit_mb` (default 768).
        """
        import shlex

        main = options.get("java_main", "MainTest")

        java_opts = options.get(
            "java_opts",
            "-Xms32m -Xmx256m -Xss512k "
            "-XX:CompressedClassSpaceSize=64m "
            "-XX:MaxMetaspaceSize=128m "
            "-XX:ReservedCodeCacheSize=64m "
            "-XX:MaxDirectMemorySize=64m "
            "-XX:+UseSerialGC -XX:+ExitOnOutOfMemoryError"
        )

        compile_cmd = ["bash", "-lc", "javac *.java"]
        # The class name is spliced into a shell command line, so quote it.
        # `java_opts` is left unquoted on purpose: it is a list of flags that
        # the shell must split into separate words.
        run_cmd = ["bash", "-lc", f"java {java_opts} {shlex.quote(str(main))}"]

        return Job(
            language="java",
            compile_argv=compile_cmd,
            run_argv=run_cmd,
            cpu_limit_s=options.get("cpu_limit_s", 3),
            wall_timeout_s=options.get("time_limit_s", 8),
            mem_limit_mb=options.get("mem_limit_mb", 768),
        )
54
+
55
+
56
# One shared recipe instance per supported language, keyed by language name.
RECIPE_REGISTRY = dict(
    python=PythonRecipe(),
    cpp=CppRecipe(),
    java=JavaRecipe(),
)
File without changes
File without changes