wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (237) hide show
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.2.dist-info/METADATA +67 -0
  215. wisent-0.5.2.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
wisent/__init__.py CHANGED
@@ -1,8 +1 @@
1
- """
2
- Wisent - Client library for interacting with the Wisent backend services.
3
- """
4
-
5
- from wisent.client import WisentClient
6
- from wisent.version import __version__
7
-
8
- __all__ = ["WisentClient", "__version__"]
1
+ __version__ = "0.5.2"
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,36 @@
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import Iterable, Protocol
4
+
5
@dataclass(frozen=True)
class SampleOutcome:
    """
    Immutable record of how a single executed sample fared (possibly after self-repair).

    attributes:
        task_id:
            Unique identifier of the task the sample belongs to.
        status:
            Execution status: "ok", "compile_error", "runtime_error", or "timeout".
        passed:
            True when the code passed all tests, False otherwise.
        elapsed:
            Execution time in seconds.
    """
    task_id: str
    status: str
    passed: bool
    elapsed: float
24
+
25
class Metric(Protocol):
    """
    Structural interface for anything that reduces SampleOutcome records to one score.
    """
    def compute(self, outcomes: Iterable[SampleOutcome]) -> float:
        ...
30
+
31
+
32
class Evaluator(Protocol):
    """
    Structural interface for a runner that executes tasks end-to-end
    (codegen + optional self-repair) and produces SampleOutcome records.
    """
    def evaluate(self) -> Iterable[SampleOutcome]:
        ...
@@ -0,0 +1,275 @@
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import Callable, Iterable, Optional, TYPE_CHECKING
4
+
5
+ from wisent.benchmarks.coding.safe_docker.core.runtime import DockerSandboxExecutor
6
+ from wisent.benchmarks.coding.safe_docker.recipes import RECIPE_REGISTRY
7
+ from wisent.benchmarks.coding.metrics.core.atoms import SampleOutcome, Evaluator
8
+
9
+ from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema
10
+ from wisent.benchmarks.coding.output_sanitizer.python_sanitizer import PythonStandardizer
11
+ from wisent.benchmarks.coding.output_sanitizer.cpp_sanitizer import CppStandardizer
12
+ from wisent.benchmarks.coding.output_sanitizer.java_sanitizer import JavaStandardizer
13
+
14
+ if TYPE_CHECKING:
15
+ from wisent.benchmarks.coding.safe_docker.core.atoms import Result
16
+ from wisent.benchmarks.coding.providers.core.atoms import Provider, CodingTask
17
+ from wisent.benchmarks.coding.output_sanitizer.core.atoms import CodeStandardizer
18
+
19
# Signature of a self-repair callback. CodingEvaluator.evaluate calls it as
# repair_fn(language, previous_files, feedback) and merges the returned
# filename -> source mapping over the task's files.
RepairFn = Callable[[str, dict[str, str], str], dict[str, str]]
20
+
21
@dataclass
class EvaluatorConfig:
    """
    Settings that control how CodingEvaluator executes and repairs code.

    attributes:
        image:
            Docker image used for code execution (default: "coding/sandbox:polyglot-1.0").
        runtime:
            Optional Docker runtime (e.g., "runsc" for gVisor); None by default.
        feedback_max_chars:
            Upper bound on the number of feedback characters handed to the repair function (default: 2000).
        self_repair:
            Whether a single self-repair turn is performed (default: True), i.e. the model
            receives feedback for exactly one extra iteration.
        time_limit_s:
            Time limit in seconds for each code execution (default: 8).
        cpu_limit_s:
            CPU time limit in seconds for each code execution (default: 3).
        mem_limit_mb:
            Memory limit in megabytes for each code execution (default: 768).
        pre_sanitize:
            Whether LLM output goes through a sanitizer before execution (default: True).
    """
    image: str = "coding/sandbox:polyglot-1.0"
    runtime: Optional[str] = None
    feedback_max_chars: int = 2000
    self_repair: bool = True
    time_limit_s: int = 8
    cpu_limit_s: int = 3
    mem_limit_mb: int = 768
    pre_sanitize: bool = True
52
+
53
# One shared sanitizer instance per supported task language, keyed by language name.
_SANITIZERS = dict(
    python=PythonStandardizer(),
    cpp=CppStandardizer(),
    java=JavaStandardizer(),
)
58
+
59
def _default_filename(lang: str) -> str:
    """
    Look up the conventional source file name for a programming language.

    arguments:
        lang:
            Programming language ("python", "cpp", or "java").

    returns:
        Default filename as a string.

    raises:
        KeyError if lang is not one of the three supported languages.
    """
    default_names = {
        "python": "solution.py",
        "cpp": "solution.cpp",
        "java": "Solution.java",
    }
    return default_names[lang]
71
+
72
def _make_schema(task: CodingTask) -> TaskSchema:
    """
    Build the TaskSchema for a CodingTask from its options, with defaults for missing keys.

    arguments:
        task:
            CodingTask providing `language` and an `options` mapping.

    returns:
        TaskSchema whose entry_point, file_name and java_class come from task.options
        (defaults: "solve", the language's default file name, and "Solution"), with
        prefer_rename and allow_wrapper both set to True.

    example:
        >>> from wisent.benchmarks.coding.providers.core.atoms import CodingTask
        >>> task = CodingTask(language="python", files={}, options={"entry_point":"add","file_name":"my_solution.py"})
        >>> schema = _make_schema(task)
        >>> (schema.language, schema.file_name, schema.entry_point, schema.java_class)
        ('python', 'my_solution.py', 'add', 'Solution')
        >>> (schema.prefer_rename, schema.allow_wrapper)
        (True, True)
    """
    opts = task.options
    return TaskSchema(
        language=task.language,
        entry_point=str(opts.get("entry_point", "solve")),
        file_name=str(opts.get("file_name", _default_filename(task.language))),
        java_class=str(opts.get("java_class", "Solution")),
        prefer_rename=True,
        allow_wrapper=True,
    )
106
+
107
class CodingEvaluator(Evaluator):
    """
    Evaluator for coding tasks with optional self-repair.

    For each task from the provider, the model's files are merged over the task's
    files, optionally sanitized, and executed in the Docker sandbox. When the first
    run does not return status "ok" and self-repair is enabled, the repair function
    gets exactly one attempt and the repaired files are executed once more.
    """
    def __init__(self, provider: Provider, model_fn: Callable[[CodingTask], dict[str, str]],
                 repair_fn: Optional[RepairFn] = None, cfg: Optional[EvaluatorConfig] = None):
        """
        arguments:
            provider:
                Source of tasks; `name` and `iter_tasks()` are read from it during evaluate().
            model_fn:
                Called with a task; returns the generated files (filename -> source).
            repair_fn:
                Optional callback invoked as repair_fn(language, files, feedback) after a
                failed first run.
            cfg:
                Evaluator settings; a fresh EvaluatorConfig() is created when omitted.
        """
        self.provider = provider
        self.model_fn = model_fn
        self.repair_fn = repair_fn
        # Build the default per instance: a default written in the signature is evaluated
        # once, so every evaluator would share (and could mutate) the same EvaluatorConfig.
        self.cfg = cfg if cfg is not None else EvaluatorConfig()
        self.exec = DockerSandboxExecutor(image=self.cfg.image, runtime=self.cfg.runtime)

    def _feedback(self, res: "Result") -> str:
        """
        Generates feedback text from a Result object for use in self-repair.

        arguments:
            res:
                Result object; its status, stdout, stderr and elapsed fields are read.

        returns:
            Feedback string, truncated to cfg.feedback_max_chars (the timeout message
            is returned as-is):
              * status "timeout"        -> "Timeout after <elapsed, 2 decimals>s."
              * status "compile_error"  -> "Compilation failed:\\n" + output
              * any other status        -> "Tests failed:\\n" + output
            where output is stdout and stderr joined by a newline, skipping empty streams.
        """
        if res.status == "timeout":
            return f"Timeout after {res.elapsed:.2f}s."
        # Join only the non-empty streams so an empty stdout does not leave a blank
        # line between the prefix and stderr.
        body = "\n".join(part for part in (res.stdout, res.stderr) if part)
        prefix = "Compilation failed:\n" if res.status == "compile_error" else "Tests failed:\n"
        return (prefix + body)[: self.cfg.feedback_max_chars]

    def _run_once(self, task: CodingTask, files: dict[str, str]) -> Result:
        """
        Runs a single evaluation job for the given task and files.

        arguments:
            task:
                The coding task to evaluate; its language selects the recipe and its
                options are forwarded to the recipe as keyword arguments.
            files:
                The files to include in the evaluation.

        returns:
            The Result produced by the sandbox executor.

        raises:
            KeyError if RECIPE_REGISTRY has no recipe for task.language.
        """
        recipe = RECIPE_REGISTRY[task.language]
        job = recipe.make_job(
            files,
            **task.options,
            time_limit_s=self.cfg.time_limit_s,
            cpu_limit_s=self.cfg.cpu_limit_s,
            mem_limit_mb=self.cfg.mem_limit_mb,
        )
        return self.exec.run(files, job)

    def _maybe_sanitize(self, task: CodingTask, files: dict[str, str]) -> dict[str, str]:
        """
        Optionally sanitizes the generated files based on the task schema.

        arguments:
            task:
                The coding task containing language and options.
            files:
                The generated files to potentially sanitize.

        returns:
            The input mapping unchanged when pre_sanitize is off, when no sanitizer is
            registered for the language, or when there is no source to sanitize.
            Otherwise a new mapping in which the schema's file name holds the
            sanitizer's output (or the raw source if the sanitizer returned nothing
            for that file name).
        """
        if not self.cfg.pre_sanitize:
            return files
        # Check for a sanitizer before building the schema so that a language without
        # one falls through to "return files" as documented.
        sanitizer: Optional["CodeStandardizer"] = _SANITIZERS.get(task.language)
        if sanitizer is None:
            return files
        schema = _make_schema(task)

        # Prefer the expected source file; fall back to unstructured model output
        # stored under the "__raw__" key.
        raw = files.get(schema.file_name) or files.get("__raw__")
        if not raw:
            return files

        out = sanitizer.normalize(raw, schema)
        return {**files, schema.file_name: out.files.get(schema.file_name, raw)}

    def evaluate(self) -> Iterable[SampleOutcome]:
        """
        Evaluates all tasks from the provider, performing optional self-repair.

        yields:
            One SampleOutcome per task with task_id "<provider.name>:<index>".
            Without a repair turn, status and elapsed come from the first run.
            With a repair turn, status comes from the repaired run, passed reflects
            the repaired run only, and elapsed is the sum of both runs.

        examples:
            >>> from wisent.benchmarks.coding.providers.core.atoms import CodingTask
            >>> class DummyProvider:
            ...     name = "dummy"
            ...     def iter_tasks(self):
            ...         yield CodingTask(language="python", files={"tests.py":"from solution import add\\ndef test_ok(): assert add(1,2)==3"},
            ...                          options={"entry_point":"add","file_name":"solution.py"})
            >>> def model_fn(task: CodingTask) -> dict[str, str]:
            ...     return {"solution.py": "def add(a,b): return a - b # BUG"}
            >>> def repair_fn(lang: str, prev_files: dict[str, str], feedback: str) -> dict[str, str]:
            ...     fixed = prev_files["solution.py"].replace("a - b", "a + b")
            ...     return {"solution.py": fixed}
            >>> evaluator = CodingEvaluator(provider=DummyProvider(), model_fn=model_fn, repair_fn=repair_fn, cfg=EvaluatorConfig(self_repair=True))
            >>> outcomes = list(evaluator.evaluate())  # requires the Docker sandbox
        """
        for idx, task in enumerate(self.provider.iter_tasks()):
            task_id = f"{self.provider.name}:{idx}"

            generated = self.model_fn(task)
            # Generated files win over the task's own files on name clashes.
            files0 = self._maybe_sanitize(task, {**task.files, **generated})

            r0 = self._run_once(task, files0)
            if r0.status == "ok":
                yield SampleOutcome(task_id=task_id, status=r0.status, passed=True, elapsed=r0.elapsed)
                continue

            if not self.cfg.self_repair or self.repair_fn is None:
                yield SampleOutcome(task_id=task_id, status=r0.status, passed=False, elapsed=r0.elapsed)
                continue

            repaired = self.repair_fn(task.language, files0, self._feedback(r0))
            files1 = self._maybe_sanitize(task, {**task.files, **repaired})

            r1 = self._run_once(task, files1)
            # The first run is known to have failed here, so only the repaired run counts.
            yield SampleOutcome(task_id=task_id, status=r1.status, passed=(r1.status == "ok"),
                                elapsed=(r0.elapsed + r1.elapsed))
@@ -0,0 +1,66 @@
1
+ # coding/metrics/passk.py
2
+ from __future__ import annotations
3
+ from typing import Iterable
4
+ import math
5
+ from collections import defaultdict
6
+ from .core.atoms import SampleOutcome, Metric
7
+
8
class PassAtK(Metric):
    """
    Exact Pass@k for code generation.
    """

    def __init__(self, k: int = 1):
        """
        arguments:
            k: Number of samples drawn per task; must be >= 1.

        raises:
            ValueError if k < 1.
        """
        if k < 1:
            raise ValueError("k must be >= 1")
        self.k = k

    def compute(self, outcomes: Iterable[SampleOutcome]) -> float:
        """
        Compute the mean Pass@k over all tasks present in `outcomes`.

        arguments:
            outcomes: Iterable of SampleOutcome objects; grouped by task_id.

        returns:
            Average Pass@k score across tasks, or 0.0 when there are no outcomes.

        intuition:
            For each task, we have n samples, c of which pass.
            We want the probability that at least one of k random picks from these n samples is a passing one.
            This is 1 - (combinations of picking k from the n-c failing ones) / (combinations of picking k from all n).
            When a task has fewer than k samples, k is clamped to n for that task.
            We then average this score across all tasks.
        """
        # task_id -> [n, c]: number of samples and number of passing samples.
        per_task = defaultdict(lambda: [0, 0])
        for o in outcomes:
            counts = per_task[o.task_id]
            counts[0] += 1
            if o.passed:
                counts[1] += 1

        if not per_task:
            return 0.0

        scores_sum = 0.0
        for n, c in per_task.values():
            # Every recorded task has n >= 1, and self.k >= 1, so k >= 1 here.
            k = min(self.k, n)
            if c == 0:
                score = 0.0
            elif k == 1:
                score = c / n
            else:
                # math.comb(n - c, k) is 0 when k > n - c, i.e. every draw of k
                # samples must contain at least one passing sample.
                score = 1.0 - math.comb(n - c, k) / math.comb(n, k)
            scores_sum += score

        return scores_sum / len(per_task)
File without changes
@@ -0,0 +1,27 @@
1
+ # coding/output_sanitizer/core/atoms.py
2
+ from __future__ import annotations
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Protocol, Literal, Optional
5
+
6
# Closed set of language identifiers a TaskSchema / CodeStandardizer may carry.
Language = Literal["python", "cpp", "java"]
7
+
8
@dataclass(frozen=True)
class TaskSchema:
    """
    What the sandbox expects for this task.

    attributes:
        language:
            Task language.
        file_name:
            Expected source file, e.g. "solution.py" | "solution.cpp" | "Solution.java".
        entry_point:
            Function/method name the tests will call (e.g. "add", "solve").
        java_class:
            Only for Java; expected public class name (default: "Solution").
        allow_wrapper:
            Optional hint: a thin wrapper may be synthesized instead of renaming (default: True).
        prefer_rename:
            Optional hint: if True and safe, rename a single top-level function to
            entry_point (default: False).
    """
    language: Language
    file_name: str
    entry_point: str
    java_class: str = "Solution"
    allow_wrapper: bool = True
    prefer_rename: bool = False
18
+
19
@dataclass(frozen=True)
class NormalizeResult:
    """
    Outcome of a CodeStandardizer.normalize call.

    attributes:
        files:
            Mapping of filename -> normalized source.
        notes:
            Human-readable log of what was done.
        ok:
            True if the standardizer believes the result is valid / parseable.
    """
    files: Dict[str, str]
    notes: str
    ok: bool
24
+
25
class CodeStandardizer(Protocol):
    """
    Structural interface for per-language sanitizers: each declares the language
    it handles and turns raw model output into a NormalizeResult for a TaskSchema.
    """
    language: Language

    def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult:
        ...
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+ import re
3
+ from typing import List
4
+ from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
5
+ from wisent.benchmarks.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
6
+
7
# Heuristic for a function header at the start of a line: optional `template<...>`,
# a return-type-like run of tokens, then the captured function name before "(".
FUNC_RE = re.compile(r"^\s*(?:template<[^>]+>\s*)?(?:[\w:\s*&<>,]+)\s+(\w+)\s*\(", re.MULTILINE)
# Captures the name in a `class Name {` or `class Name :` declaration at the start of a line.
CLASS_RE = re.compile(r"^\s*class\s+(\w+)\s*[{:]", re.MULTILINE)
9
+
10
class CppStandardizer(CodeStandardizer):
    """
    Normalizes raw LLM output into a C++ source that exposes schema.entry_point.

    Strategy, in order:
      1. If the entry point already appears followed by "(", return the code as-is.
      2. If a class appears to own the entry point, append a free-function shim.
      3. If exactly one function header is found and prefer_rename is set, rename it.
      4. Otherwise forward the entry point to the first function header found.
      5. If nothing matches, return the normalized source with ok=False.
    """
    language = "cpp"

    @staticmethod
    def _with_forwarding_shim(code: str, entry_point: str, callee: str) -> str:
        """
        Return `code` followed by a variadic template named `entry_point` that
        perfect-forwards its arguments to `callee`, adding `#include <utility>`
        at the top when that exact include line is not already present.
        """
        shim = (
            f"\n\ntemplate <typename... Args>\n"
            f"auto {entry_point}(Args&&... args)\n"
            f" -> decltype({callee}(std::forward<Args>(args)...)) {{\n"
            f" return {callee}(std::forward<Args>(args)...);\n"
            f"}}\n"
        )
        if "#include <utility>" not in code:
            code = "#include <utility>\n" + code
        return code + shim

    def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult:
        """
        arguments:
            raw:
                Raw model output, possibly wrapped in a fenced code block.
            schema:
                Task schema; file_name, entry_point and prefer_rename are used.

        returns:
            NormalizeResult holding the (possibly rewritten) source under
            schema.file_name, a newline-joined log of actions, and ok=False only
            when no function could be identified.
        """
        notes: List[str] = []
        entry = schema.entry_point
        entry_pat = re.escape(entry)

        code = normalize_whitespace(extract_code_block(raw, prefer_langs=("cpp","c++","cc","c")))
        code = re.sub(r"^```.*?\n|\n```$", "", code, flags=re.DOTALL)

        if re.search(rf"\b{entry_pat}\s*\(", code):
            notes.append(f"found function '{schema.entry_point}'")
            return NormalizeResult(files={schema.file_name: code}, notes="\n".join(notes), ok=True)

        # NOTE(review): both patterns below contain the pattern tested just above, so for
        # identifier entry points this branch can never match — a class member named like
        # the entry point is reported as "found function" and no shim is added. Left
        # as-is to avoid changing behaviour; confirm the intended precedence.
        for cls in CLASS_RE.findall(code):
            cls_pat = re.escape(cls)
            if re.search(rf"\b{cls_pat}\s*::\s*{entry_pat}\s*\(", code) or \
               re.search(rf"class\s+{cls_pat}.*?\b{entry_pat}\s*\(", code, flags=re.S):
                notes.append(f"found {cls}::{schema.entry_point}; adding free-function shim")
                shimmed = self._with_forwarding_shim(code, entry, f"{cls}().{entry}")
                return NormalizeResult(files={schema.file_name: shimmed}, notes="\n".join(notes), ok=True)

        candidates = [m.group(1) for m in FUNC_RE.finditer(code)]
        if schema.prefer_rename and len(candidates) == 1:
            old = candidates[0]
            if old != entry:
                notes.append(f"renaming free function '{old}' -> '{schema.entry_point}'")
                # A function replacement inserts the entry point literally; a template
                # string would misread a leading digit or backslash as a group reference.
                code2 = re.sub(
                    rf"(\b){re.escape(old)}(\s*\()",
                    lambda m: f"{m.group(1)}{entry}{m.group(2)}",
                    code,
                )
                return NormalizeResult(files={schema.file_name: code2}, notes="\n".join(notes), ok=True)

        if candidates:
            target = candidates[0]
            if target != entry:
                notes.append(f"adding forwarding wrapper {schema.entry_point} -> {target}")
                shimmed = self._with_forwarding_shim(code, entry, target)
                return NormalizeResult(files={schema.file_name: shimmed}, notes="\n".join(notes), ok=True)

        notes.append("no obvious function; returned normalized source only")
        return NormalizeResult(files={schema.file_name: code}, notes="\n".join(notes), ok=False)
@@ -0,0 +1,78 @@
1
+ # coding/output_sanitizer/java_sanitizer.py
2
+ from __future__ import annotations
3
+ import re
4
+ from typing import List
5
+ from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
6
+ from wisent.benchmarks.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
7
+
8
# First capture group is the identifier following the ``class`` keyword.
CLASS_RE = re.compile(r"\bclass\s+([A-Za-z_]\w*)")
# Group 1 is the ``public static <type> `` prefix, group 2 the method name.
METHOD_RE = re.compile(r"(public\s+static\s+[\w\<\>\[\]]+\s+)(\w+)\s*\(")


class JavaStandardizer(CodeStandardizer):
    """Normalize raw model output into a Java source file for a task schema.

    ``normalize`` tries, in order:

    1. make sure a class named ``schema.java_class`` exists (rename the first
       class found, or wrap the code in a new public class);
    2. accept the code as-is if a ``public static`` method named
       ``schema.entry_point`` is present;
    3. rename the only ``public static`` method when ``schema.prefer_rename``
       is set;
    4. inject a reflective static wrapper when the entry point name appears
       followed by ``(`` anywhere in the code;
    5. otherwise inject a reflective fallback that tries every public method.
    """

    language = "java"

    @staticmethod
    def _insert_member(code: str, class_name: str, member: str):
        """Insert ``member`` right after the opening brace of ``class_name``.

        The header may contain ``extends``/``implements`` clauses or type
        parameters between the class name and the brace.

        Returns:
            ``(new_code, inserted)`` where ``inserted`` is False when no
            matching class header was found (``new_code`` is then unchanged).
        """
        header = rf"(\bclass\s+{re.escape(class_name)}\b[^{{]*{{)"
        # A callable replacement keeps backslashes in ``member`` literal.
        new_code, hits = re.subn(header, lambda m: m.group(1) + member, code, count=1)
        return new_code, bool(hits)

    def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult:
        """Return ``raw`` rewritten so that ``schema.java_class`` exposes the entry point.

        Args:
            raw: Raw text to extract Java code from.
            schema: Supplies ``java_class``, ``entry_point``, ``file_name`` and
                ``prefer_rename``.

        Returns:
            A ``NormalizeResult`` holding one file (``schema.file_name``), the
            accumulated notes joined by newlines, and ``ok``. ``ok`` is False
            only when a generated method could not be inserted into the class.
        """
        notes: List[str] = []
        entry = schema.entry_point
        target_cls = schema.java_class

        def done(text: str, ok: bool = True) -> NormalizeResult:
            return NormalizeResult(files={schema.file_name: text}, notes="\n".join(notes), ok=ok)

        # ``("java",)`` must be a real tuple; ``("java")`` is just the string "java".
        code = normalize_whitespace(extract_code_block(raw, prefer_langs=("java",)))
        code = re.sub(r"^```.*?\n|\n```$", "", code, flags=re.DOTALL)

        class_names = CLASS_RE.findall(code)
        if class_names and target_cls not in class_names:
            # Only rename when the target class is absent, otherwise we would
            # create a duplicate. Rename every reference, not just the
            # declaration: constructors and ``new Found()`` must follow.
            found = class_names[0]
            notes.append(f"renaming class '{found}' -> '{target_cls}'")
            code = re.sub(rf"\b{re.escape(found)}\b", lambda _m: target_cls, code)

        if not class_names:
            notes.append(f"wrapping code in class {target_cls}")
            code = f"public class {target_cls} {{\n{indent(code)}\n}}\n"

        static_methods = list(METHOD_RE.finditer(code))
        if any(m.group(2) == entry for m in static_methods):
            notes.append(f"found public static '{entry}'")
            return done(code)

        if len(static_methods) == 1 and schema.prefer_rename:
            # The name cannot equal ``entry`` here; that case returned above.
            old = static_methods[0].group(2)
            notes.append(f"renaming static method '{old}' -> '{entry}'")
            # Rename the declaration and unqualified calls (e.g. recursion),
            # but leave ``other.old(...)`` calls on unrelated objects alone.
            code = re.sub(rf"(?<![\w.]){re.escape(old)}(?=\s*\()", lambda _m: entry, code)
            # Calls qualified with the class itself are renamed as well.
            code = re.sub(
                rf"\b{re.escape(target_cls)}\s*\.\s*{re.escape(old)}(?=\s*\()",
                lambda _m: f"{target_cls}.{entry}",
                code,
            )
            return done(code)

        if re.search(rf"\b{re.escape(entry)}\s*\(", code):
            notes.append(f"adding static wrapper for instance method '{entry}'")
            # NOTE(review): the generated lookup uses the runtime classes of the
            # arguments, so primitive-typed parameters presumably will not
            # match -- confirm against the harness.
            wrapper = (
                f"\n    public static <T> Object {entry}(Object... args) {{\n"
                f"        {target_cls} _x = new {target_cls}();\n"
                f"        try {{\n"
                f"            // attempt reflective dispatch to instance method\n"
                f"            Class<?>[] types = new Class<?>[args.length];\n"
                f"            for (int i=0;i<args.length;i++) types[i] = args[i].getClass();\n"
                f"            return {target_cls}.class.getMethod(\"{entry}\", types).invoke(_x, args);\n"
                f"        }} catch (Exception ex) {{ throw new RuntimeException(ex); }}\n"
                f"    }}\n"
            )
            code, inserted = self._insert_member(code, target_cls, wrapper)
            if not inserted:
                notes.append(f"could not locate body of class {target_cls}; wrapper not inserted")
            return done(code, ok=inserted)

        notes.append("no suitable method; adding delegating static method to first public static or instance method via reflection")
        fallback = (
            f"\n    public static Object {entry}(Object... args) {{\n"
            f"        try {{\n"
            f"            // try any public method first\n"
            f"            for (var m : {target_cls}.class.getMethods()) {{\n"
            f"                if (m.getName().equals(\"{entry}\")) continue;\n"
            f"                try {{ return m.invoke(m.getParameterCount()==0? new {target_cls}(): new {target_cls}(), args); }}\n"
            f"                catch (Exception ignored) {{}}\n"
            f"            }}\n"
            f"        }} catch (Exception e) {{ throw new RuntimeException(e); }}\n"
            f"        throw new RuntimeException(\"No suitable method for entry point\");\n"
            f"    }}\n"
        )
        code, inserted = self._insert_member(code, target_cls, fallback)
        if not inserted:
            notes.append(f"could not locate body of class {target_cls}; fallback not inserted")
        return done(code, ok=inserted)
75
+
76
def indent(s: str, n: int = 4) -> str:
    """Prefix every non-blank line of ``s`` with ``n`` spaces.

    Empty and whitespace-only lines are passed through unchanged. Lines are
    split with ``str.splitlines`` and rejoined with a single newline, so a
    trailing newline in ``s`` is not preserved.
    """
    prefix = " " * n
    shifted = []
    for line in s.splitlines():
        if line.strip():
            shifted.append(prefix + line)
        else:
            shifted.append(line)
    return "\n".join(shifted)
@@ -0,0 +1,94 @@
1
+ # coding/output_sanitizer/python_sanitizer.py
2
+ from __future__ import annotations
3
+ import ast, re
4
+ from typing import List
5
+ from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
6
+ from wisent.benchmarks.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace, maybe_black
7
+
8
class PythonStandardizer(CodeStandardizer):
    """Normalize raw model output into a Python module exposing the entry point.

    ``normalize`` tries, in order:

    1. accept the code if a top-level function named ``schema.entry_point``
       already exists;
    2. rename the only top-level function when ``schema.prefer_rename`` is set;
    3. append a thin module-level adapter when a top-level class defines a
       method with the entry point name;
    4. rename a top-level function with a conventional name
       (``solve``, ``solution``, ``func``, ``function``, ``answer``);
    5. append a dynamic-dispatch adapter when ``schema.allow_wrapper`` is set.
    """

    language = "python"

    # ``async def`` counts as a function definition too, so an existing async
    # entry point is recognised instead of being shadowed by an adapter.
    _FUNC_DEFS = (ast.FunctionDef, ast.AsyncFunctionDef)

    @staticmethod
    def _rename_function(tree: ast.Module, code: str, old: str, new: str) -> str:
        """Return source in which function ``old`` is called ``new``.

        Both the definitions and plain-name references are renamed, so
        recursive calls keep working. ``tree`` is modified in place. If the AST
        route fails (for example ``ast.unparse`` is unavailable), a textual
        rename on ``code`` is used instead.
        """

        class _Renamer(ast.NodeTransformer):
            def _visit_def(self, node):
                if node.name == old:
                    node.name = new
                return self.generic_visit(node)

            visit_FunctionDef = _visit_def
            visit_AsyncFunctionDef = _visit_def

            def visit_Name(self, node: ast.Name):
                if node.id == old:
                    node.id = new
                return node

        try:
            renamed = _Renamer().visit(tree)
            ast.fix_missing_locations(renamed)
            return ast.unparse(renamed)
        except Exception:
            # Deliberate best-effort fallback: rename ``def old(`` and
            # unqualified ``old(`` calls textually; ``obj.old(`` is left alone.
            return re.sub(rf"(?<![\w.]){re.escape(old)}(?=\s*\()", lambda _m: new, code)

    def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult:
        """Return ``raw`` rewritten so that ``schema.entry_point`` is callable at module level.

        Args:
            raw: Raw text to extract Python code from.
            schema: Supplies ``entry_point``, ``file_name``, ``prefer_rename``
                and ``allow_wrapper``.

        Returns:
            A ``NormalizeResult`` holding one file (``schema.file_name``), the
            accumulated notes joined by newlines, and ``ok``. ``ok`` is False
            when the code does not parse or no strategy applied.
        """
        notes: List[str] = []
        entry = schema.entry_point

        def done(text: str, ok: bool) -> NormalizeResult:
            return NormalizeResult(files={schema.file_name: text}, notes="\n".join(notes), ok=ok)

        code = extract_code_block(raw, prefer_langs=("python", "py"))
        code = normalize_whitespace(code)
        code = re.sub(r"^```.*?\n|\n```$", "", code, flags=re.DOTALL)

        try:
            tree = ast.parse(code)
        except SyntaxError as e:
            notes.append(f"parse failed: {e}; returning raw after whitespace normalize")
            return done(code, False)

        fn_names = [n.name for n in tree.body if isinstance(n, self._FUNC_DEFS)]
        cls_nodes = [n for n in tree.body if isinstance(n, ast.ClassDef)]

        if entry in fn_names:
            notes.append(f"top-level function '{entry}' found")
            return done(maybe_black(code), True)

        if schema.prefer_rename and len(fn_names) == 1:
            old = fn_names[0]
            notes.append(f"renaming single function '{old}' -> '{entry}'")
            new_code = self._rename_function(tree, code, old, entry)
            return done(maybe_black(new_code), True)

        for cls in cls_nodes:
            method_names = [n.name for n in cls.body if isinstance(n, self._FUNC_DEFS)]
            if entry in method_names:
                notes.append(f"found method {cls.name}.{entry}; adding thin adapter")
                # The adapter instantiates the class with no arguments.
                adapter = (
                    f"\n\ndef {entry}(*args, **kwargs):\n"
                    f"    return {cls.name}().{entry}(*args, **kwargs)\n"
                )
                return done(maybe_black(code + adapter), True)

        candidates = [n for n in fn_names if n in {"solve", "solution", "func", "function", "answer"}]
        if candidates:
            old = candidates[0]
            notes.append(f"renaming fallback '{old}' -> '{entry}'")
            new_code = self._rename_function(tree, code, old, entry)
            return done(maybe_black(new_code), True)

        if schema.allow_wrapper:
            notes.append("no entry found; appending dynamic-dispatch adapter to call first callable")
            # The generated loop iterates a snapshot of globals() so that a
            # called function which mutates module globals cannot break it.
            adapter = (
                f"\n\ndef {entry}(*args, **kwargs):\n"
                f"    # fallback: try first callable in module\n"
                f"    import inspect\n"
                f"    for _name, _obj in list(globals().items()):\n"
                f"        if callable(_obj) and _name not in ('{entry}',):\n"
                f"            try:\n"
                f"                return _obj(*args, **kwargs)\n"
                f"            except TypeError:\n"
                f"                continue\n"
                f"    raise NameError('No suitable function for entry point')\n"
            )
            return done(maybe_black(code + adapter), True)

        return done(code, False)