wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic. Click here for more details.
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.1.dist-info/METADATA +67 -0
- wisent-0.5.1.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
wisent/__init__.py
CHANGED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Iterable, Protocol
|
|
4
|
+
|
|
5
|
+
@dataclass(frozen=True)
class SampleOutcome:
    """Immutable record of one executed sample (possibly after self-repair).

    attributes:
        task_id:
            Unique identifier of the task this sample belongs to.
        status:
            Terminal status string: "ok", "compile_error", "runtime_error",
            or "timeout".
        passed:
            True when the code passed all tests, False otherwise.
        elapsed:
            Wall-clock execution time in seconds.
    """

    task_id: str
    status: str
    passed: bool
    elapsed: float
|
|
24
|
+
|
|
25
|
+
class Metric(Protocol):
    """Structural interface: reduce an iterable of SampleOutcome to a score."""

    def compute(self, outcomes: Iterable[SampleOutcome]) -> float: ...
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Evaluator(Protocol):
    """Structural interface for anything that runs tasks end-to-end
    (code generation plus optional self-repair) and yields SampleOutcome."""

    def evaluate(self) -> Iterable[SampleOutcome]: ...
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Callable, Iterable, Optional, TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from wisent_guard.benchmarks.coding.safe_docker.core.runtime import DockerSandboxExecutor
|
|
6
|
+
from wisent_guard.benchmarks.coding.safe_docker.recipes import RECIPE_REGISTRY
|
|
7
|
+
from wisent_guard.benchmarks.coding.metrics.core.atoms import SampleOutcome, Evaluator
|
|
8
|
+
|
|
9
|
+
from wisent_guard.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema
|
|
10
|
+
from wisent_guard.benchmarks.coding.output_sanitizer.python_sanitizer import PythonStandardizer
|
|
11
|
+
from wisent_guard.benchmarks.coding.output_sanitizer.cpp_sanitizer import CppStandardizer
|
|
12
|
+
from wisent_guard.benchmarks.coding.output_sanitizer.java_sanitizer import JavaStandardizer
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from wisent_guard.benchmarks.coding.safe_docker.core.atoms import Result
|
|
16
|
+
from wisent_guard.benchmarks.coding.providers.core.atoms import Provider, CodingTask
|
|
17
|
+
from wisent_guard.benchmarks.coding.output_sanitizer.core.atoms import CodeStandardizer
|
|
18
|
+
|
|
19
|
+
# Signature of a self-repair callback: (language, previous_files, feedback) -> new files.
RepairFn = Callable[[str, dict[str, str], str], dict[str, str]]


@dataclass
class EvaluatorConfig:
    """Tunable knobs for CodingEvaluator.

    attributes:
        image:
            Docker image used to execute generated code.
        runtime:
            Optional Docker runtime name (e.g. "runsc" for gVisor).
        feedback_max_chars:
            Upper bound on the feedback text handed to the repair callback.
        self_repair:
            When True, a single self-repair round (feedback + regeneration)
            is attempted after a failed run.
        time_limit_s:
            Wall-clock limit per execution, in seconds.
        cpu_limit_s:
            CPU-time limit per execution, in seconds.
        mem_limit_mb:
            Memory cap per execution, in megabytes.
        pre_sanitize:
            When True, raw LLM output is run through a language sanitizer
            before execution.
    """

    image: str = "coding/sandbox:polyglot-1.0"
    runtime: Optional[str] = None
    feedback_max_chars: int = 2000
    self_repair: bool = True
    time_limit_s: int = 8
    cpu_limit_s: int = 3
    mem_limit_mb: int = 768
    pre_sanitize: bool = True
|
|
52
|
+
|
|
53
|
+
# Language name -> sanitizer instance. Languages missing from this mapping
# skip sanitization entirely (the lookup in _maybe_sanitize returns None).
_SANITIZERS = {
    "python": PythonStandardizer(),
    "cpp": CppStandardizer(),
    "java": JavaStandardizer(),
}
|
|
58
|
+
|
|
59
|
+
def _default_filename(lang: str) -> str:
|
|
60
|
+
"""
|
|
61
|
+
Returns the default source file name for a given programming language.
|
|
62
|
+
|
|
63
|
+
arguments:
|
|
64
|
+
lang:
|
|
65
|
+
Programming language ("python", "cpp", or "java").
|
|
66
|
+
|
|
67
|
+
returns:
|
|
68
|
+
Default filename as a string.
|
|
69
|
+
"""
|
|
70
|
+
return {"python":"solution.py","cpp":"solution.cpp","java":"Solution.java"}[lang]
|
|
71
|
+
|
|
72
|
+
def _make_schema(task: CodingTask) -> TaskSchema:
    """Build the TaskSchema the sanitizers expect from a CodingTask.

    Values come from ``task.options`` when present; otherwise defaults are
    used: entry point "solve", the language's default file name, and Java
    class "Solution". The schema always sets ``prefer_rename=True`` and
    ``allow_wrapper=True``.

    arguments:
        task:
            CodingTask carrying ``language`` and an ``options`` mapping.

    returns:
        A fully populated TaskSchema for ``task``.
    """
    opts = task.options
    return TaskSchema(
        language=task.language,
        file_name=str(opts.get("file_name", _default_filename(task.language))),
        entry_point=str(opts.get("entry_point", "solve")),
        java_class=str(opts.get("java_class", "Solution")),
        prefer_rename=True,
        allow_wrapper=True,
    )
|
|
106
|
+
|
|
107
|
+
class CodingEvaluator(Evaluator):
    """Evaluator for coding tasks with an optional single self-repair round.

    Pipeline per task: generate files via ``model_fn`` -> merge with the
    task's own files -> optionally sanitize -> execute in a Docker sandbox.
    On failure, when self-repair is enabled and a ``repair_fn`` is given,
    feedback is derived from the first run, ``repair_fn`` produces new files,
    and the task is executed once more.
    """

    def __init__(
        self,
        provider: Provider,
        model_fn: Callable[[CodingTask], dict[str, str]],
        repair_fn: Optional[RepairFn] = None,
        cfg: Optional[EvaluatorConfig] = None,
    ):
        """
        arguments:
            provider:
                Source of CodingTask items; must expose ``name`` and ``iter_tasks()``.
            model_fn:
                Callable mapping a task to generated files ({file name: source}).
            repair_fn:
                Optional callback (language, previous_files, feedback) -> new files,
                used for the single self-repair round.
            cfg:
                Evaluator configuration. Defaults to a fresh EvaluatorConfig per
                instance. (Previously the default was a single shared
                ``EvaluatorConfig()`` instance — a mutable default argument —
                so mutating one evaluator's config leaked into others.)
        """
        self.provider = provider
        self.model_fn = model_fn
        self.repair_fn = repair_fn
        self.cfg = cfg if cfg is not None else EvaluatorConfig()
        self.exec = DockerSandboxExecutor(image=self.cfg.image, runtime=self.cfg.runtime)

    def _feedback(self, res: "Result") -> str:
        """Summarize a failed Result as feedback text for the repair callback.

        arguments:
            res:
                Execution Result with ``status``, ``stdout``, ``stderr`` and
                ``elapsed`` attributes.

        returns:
            "Timeout after X.XXs." for timeouts. Otherwise stdout (plus
            stderr on a new line when present) prefixed with
            "Compilation failed:\\n" for compile errors or "Tests failed:\\n"
            for any other failure, truncated to ``cfg.feedback_max_chars``.
        """
        if res.status == "timeout":
            return f"Timeout after {res.elapsed:.2f}s."
        body = (res.stdout or "") + ("\n" + res.stderr if res.stderr else "")
        prefix = "Compilation failed:\n" if res.status == "compile_error" else "Tests failed:\n"
        return (prefix + body)[: self.cfg.feedback_max_chars]

    def _run_once(self, task: CodingTask, files: dict[str, str]) -> "Result":
        """Execute one sandbox job for ``task`` with the given files.

        arguments:
            task:
                Task whose language selects the recipe; ``task.options`` is
                forwarded to the recipe's ``make_job``.
            files:
                Mapping of file name -> source content placed in the sandbox.

        returns:
            The sandbox Result (status, stdout, stderr, elapsed).
        """
        recipe = RECIPE_REGISTRY[task.language]
        job = recipe.make_job(
            files,
            **task.options,
            time_limit_s=self.cfg.time_limit_s,
            cpu_limit_s=self.cfg.cpu_limit_s,
            mem_limit_mb=self.cfg.mem_limit_mb,
        )
        return self.exec.run(files, job)

    def _maybe_sanitize(self, task: CodingTask, files: dict[str, str]) -> dict[str, str]:
        """Run the language sanitizer over the generated main file, if enabled.

        The raw source is looked up under the schema's file name, falling
        back to the "__raw__" key. Files are returned unchanged when
        sanitization is disabled, no sanitizer exists for the language, or
        no raw source is found.

        arguments:
            task:
                Task providing the language and schema options.
            files:
                Generated files; never mutated (a new dict is returned on change).

        returns:
            The (possibly) sanitized files mapping.
        """
        if not self.cfg.pre_sanitize:
            return files
        sanitizer: "CodeStandardizer" = _SANITIZERS.get(task.language)
        if sanitizer is None:
            return files

        schema = _make_schema(task)
        raw = files.get(schema.file_name) or files.get("__raw__")
        if not raw:
            return files

        out = sanitizer.normalize(raw, schema)
        # Fall back to the original source if the sanitizer did not emit the target file.
        return {**files, schema.file_name: out.files.get(schema.file_name, raw)}

    def evaluate(self) -> Iterable[SampleOutcome]:
        """Run every provider task end-to-end, yielding one SampleOutcome each.

        Task ids are "<provider.name>:<index>". When the first run fails and
        self-repair is active, the second run's status is reported and the
        elapsed times of both runs are summed.

        yields:
            SampleOutcome per task with pass/fail status and elapsed seconds.
        """
        for idx, task in enumerate(self.provider.iter_tasks()):
            task_id = f"{self.provider.name}:{idx}"
            files0 = {**task.files, **self.model_fn(task)}
            files0 = self._maybe_sanitize(task, files0)

            r0 = self._run_once(task, files0)
            if r0.status == "ok":
                yield SampleOutcome(task_id=task_id, status=r0.status, passed=True, elapsed=r0.elapsed)
                continue

            if not self.cfg.self_repair or self.repair_fn is None:
                yield SampleOutcome(task_id=task_id, status=r0.status, passed=False, elapsed=r0.elapsed)
                continue

            feedback = self._feedback(r0)
            files1 = {**task.files, **self.repair_fn(task.language, files0, feedback)}
            files1 = self._maybe_sanitize(task, files1)

            r1 = self._run_once(task, files1)
            # r0 is known to have failed at this point, so only the repair run
            # decides success (the original also OR-ed in the dead r0 check).
            yield SampleOutcome(
                task_id=task_id,
                status=r1.status,
                passed=(r1.status == "ok"),
                elapsed=r0.elapsed + r1.elapsed,
            )
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# coding/metrics/passk.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import Iterable
|
|
4
|
+
import math
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from .core.atoms import SampleOutcome, Metric
|
|
7
|
+
|
|
8
|
+
class PassAtK(Metric):
    """Exact pass@k metric for code generation.

    For a task with ``n`` samples of which ``c`` pass, pass@k is the
    probability that at least one of ``k`` uniformly drawn samples passes:
    ``1 - C(n - c, k) / C(n, k)``. The per-task scores are averaged.
    """

    def __init__(self, k: int = 1):
        """
        arguments:
            k:
                Number of samples drawn per task; must be >= 1.

        raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be >= 1")
        self.k = k

    def compute(self, outcomes: Iterable[SampleOutcome]) -> float:
        """Average pass@k over all task_ids present in ``outcomes``.

        arguments:
            outcomes:
                Iterable of SampleOutcome-like objects exposing ``task_id``
                and ``passed``.

        returns:
            Mean pass@k across tasks, or 0.0 when ``outcomes`` is empty.
        """
        # Aggregate per task: task_id -> [n samples, c passing].
        per_task = defaultdict(lambda: [0, 0])
        for o in outcomes:
            counts = per_task[o.task_id]
            counts[0] += 1
            if o.passed:
                counts[1] += 1

        if not per_task:
            return 0.0

        total = 0.0
        for n, c in per_task.values():
            # Cannot draw more samples than exist for the task.
            k = min(self.k, n)
            # math.comb(n - c, k) is 0 whenever k > n - c, so the original's
            # special cases (c <= 0, k == 1, unreachable n <= 0 / k == 0
            # branches — k >= 1 is enforced in __init__) all collapse into
            # the single closed-form below. C(n, k) > 0 since 1 <= k <= n.
            total += 1.0 - math.comb(n - c, k) / math.comb(n, k)

        # Every bucket has n >= 1, so the task count is simply len(per_task).
        return total / len(per_task)
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# coding/llm_sanitizer/core/atoms.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Dict, Protocol, Literal, Optional
|
|
5
|
+
|
|
6
|
+
# Languages the sanitizer stack understands.
Language = Literal["python", "cpp", "java"]


@dataclass(frozen=True)
class TaskSchema:
    """Describes what the sandbox expects for one task.

    attributes:
        language:
            Target language of the solution file.
        file_name:
            Expected source file name, e.g. "solution.py" / "solution.cpp" /
            "Solution.java".
        entry_point:
            Function or method name the tests will call (e.g. "add", "solve").
        java_class:
            Expected public class name; only meaningful for Java.
        allow_wrapper:
            Whether a thin wrapper may be synthesized instead of renaming.
        prefer_rename:
            Whether a lone top-level function may be renamed to
            ``entry_point`` when that is safe.
    """

    language: Language
    file_name: str
    entry_point: str
    java_class: str = "Solution"
    allow_wrapper: bool = True
    prefer_rename: bool = False
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
class NormalizeResult:
    """Immutable outcome of normalizing one raw LLM output."""

    files: Dict[str, str]  # file name -> normalized source text
    notes: str             # human-readable log of what was done
    ok: bool               # True if the result looks valid / parseable
|
|
24
|
+
|
|
25
|
+
class CodeStandardizer(Protocol):
    """Structural interface for per-language LLM-output sanitizers."""

    # Language this standardizer handles.
    language: Language

    def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult: ...
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import re
|
|
3
|
+
from typing import List
|
|
4
|
+
from wisent_guard.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
|
|
5
|
+
from wisent_guard.benchmarks.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
|
|
6
|
+
|
|
7
|
+
# Heuristic match for a C++ free-function definition header (optionally
# templated) at the start of a line; group 1 captures the function name.
# NOTE(review): a regex cannot fully parse C++ declarators — this assumes
# single-line signatures; verify against real provider outputs.
FUNC_RE = re.compile(r"^\s*(?:template<[^>]+>\s*)?(?:[\w:\s*&<>,]+)\s+(\w+)\s*\(", re.MULTILINE)
# Matches a class definition line ("class Name {" or "class Name :");
# group 1 captures the class name.
CLASS_RE = re.compile(r"^\s*class\s+(\w+)\s*[{:]", re.MULTILINE)
|
|
9
|
+
|
|
10
|
+
class CppStandardizer(CodeStandardizer):
|
|
11
|
+
language = "cpp"
|
|
12
|
+
|
|
13
|
+
def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult:
|
|
14
|
+
notes: List[str] = []
|
|
15
|
+
code = normalize_whitespace(extract_code_block(raw, prefer_langs=("cpp","c++","cc","c")))
|
|
16
|
+
code = re.sub(r"^```.*?\n|\n```$", "", code, flags=re.DOTALL)
|
|
17
|
+
|
|
18
|
+
if re.search(rf"\b{re.escape(schema.entry_point)}\s*\(", code):
|
|
19
|
+
notes.append(f"found function '{schema.entry_point}'")
|
|
20
|
+
return NormalizeResult(files={schema.file_name: code}, notes="\n".join(notes), ok=True)
|
|
21
|
+
|
|
22
|
+
classes = CLASS_RE.findall(code)
|
|
23
|
+
for cls in classes:
|
|
24
|
+
if re.search(rf"\b{re.escape(cls)}\s*::\s*{re.escape(schema.entry_point)}\s*\(", code) or \
|
|
25
|
+
re.search(rf"class\s+{re.escape(cls)}.*?\b{re.escape(schema.entry_point)}\s*\(", code, flags=re.S):
|
|
26
|
+
notes.append(f"found {cls}::{schema.entry_point}; adding free-function shim")
|
|
27
|
+
shim = (
|
|
28
|
+
f"\n\ntemplate <typename... Args>\n"
|
|
29
|
+
f"auto {schema.entry_point}(Args&&... args)\n"
|
|
30
|
+
f" -> decltype({cls}().{schema.entry_point}(std::forward<Args>(args)...)) {{\n"
|
|
31
|
+
f" return {cls}().{schema.entry_point}(std::forward<Args>(args)...);\n"
|
|
32
|
+
f"}}\n"
|
|
33
|
+
)
|
|
34
|
+
if "#include <utility>" not in code:
|
|
35
|
+
code = "#include <utility>\n" + code
|
|
36
|
+
return NormalizeResult(files={schema.file_name: code + shim}, notes="\n".join(notes), ok=True)
|
|
37
|
+
|
|
38
|
+
candidates = [m.group(1) for m in FUNC_RE.finditer(code)]
|
|
39
|
+
if schema.prefer_rename and len(candidates) == 1:
|
|
40
|
+
old = candidates[0]
|
|
41
|
+
if old != schema.entry_point:
|
|
42
|
+
notes.append(f"renaming free function '{old}' -> '{schema.entry_point}'")
|
|
43
|
+
code2 = re.sub(rf"(\b){re.escape(old)}(\s*\()", rf"\1{schema.entry_point}\2", code)
|
|
44
|
+
return NormalizeResult(files={schema.file_name: code2}, notes="\n".join(notes), ok=True)
|
|
45
|
+
|
|
46
|
+
if candidates:
|
|
47
|
+
target = candidates[0]
|
|
48
|
+
if target != schema.entry_point:
|
|
49
|
+
notes.append(f"adding forwarding wrapper {schema.entry_point} -> {target}")
|
|
50
|
+
shim = (
|
|
51
|
+
f"\n\ntemplate <typename... Args>\n"
|
|
52
|
+
f"auto {schema.entry_point}(Args&&... args)\n"
|
|
53
|
+
f" -> decltype({target}(std::forward<Args>(args)...)) {{\n"
|
|
54
|
+
f" return {target}(std::forward<Args>(args)...);\n"
|
|
55
|
+
f"}}\n"
|
|
56
|
+
)
|
|
57
|
+
if "#include <utility>" not in code:
|
|
58
|
+
code = "#include <utility>\n" + code
|
|
59
|
+
return NormalizeResult(files={schema.file_name: code + shim}, notes="\n".join(notes), ok=True)
|
|
60
|
+
|
|
61
|
+
notes.append("no obvious function; returned normalized source only")
|
|
62
|
+
return NormalizeResult(files={schema.file_name: code}, notes="\n".join(notes), ok=False)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# coding/llm_sanitizer/java_sanitizer.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import re
|
|
4
|
+
from typing import List
|
|
5
|
+
from wisent_guard.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
|
|
6
|
+
from wisent_guard.benchmarks.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
|
|
7
|
+
|
|
8
|
+
# Heuristic matchers for Java class and `public static` method declarations.
CLASS_RE = re.compile(r"\bclass\s+([A-Za-z_]\w*)")
METHOD_RE = re.compile(r"(public\s+static\s+[\w\<\>\[\]]+\s+)(\w+)\s*\(")


class JavaStandardizer(CodeStandardizer):
    """Normalize raw LLM output into a single Java source file whose class is
    ``schema.java_class`` and which exposes ``schema.entry_point`` as a public
    static method (synthesizing a reflective static wrapper when the entry
    point only exists as an instance method, or not at all)."""

    language = "java"

    def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult:
        """Extract, clean, and adapt *raw* Java so the sandbox can call the entry point."""
        notes: List[str] = []
        # BUG FIX: ("java") is just the string "java", not a 1-tuple, so
        # extract_code_block would iterate it character by character
        # ("j", "a", "v", "a"). Sibling sanitizers correctly pass tuples.
        code = normalize_whitespace(extract_code_block(raw, prefer_langs=("java",)))
        # Strip any fence residue extract_code_block may have left behind.
        code = re.sub(r"^```.*?\n|\n```$", "", code, flags=re.DOTALL)

        # Rename the first declared class to the expected name.
        m = CLASS_RE.search(code)
        if m:
            found = m.group(1)
            if found != schema.java_class:
                notes.append(f"renaming class '{found}' -> '{schema.java_class}'")
                code = re.sub(rf"\bclass\s+{re.escape(found)}\b", f"class {schema.java_class}", code, count=1)

        # Bare snippet with no class at all: wrap it in the expected class.
        if not CLASS_RE.search(code):
            notes.append(f"wrapping code in class {schema.java_class}")
            code = f"public class {schema.java_class} {{\n{indent(code)}\n}}\n"

        # Entry point already a public static method: done.
        static_methods = list(METHOD_RE.finditer(code))
        if any(sm.group(2) == schema.entry_point for sm in static_methods):
            notes.append(f"found public static '{schema.entry_point}'")
            return NormalizeResult(files={schema.file_name: code}, notes="\n".join(notes), ok=True)

        # Exactly one static method and renaming is allowed: rename it in place.
        if len(static_methods) == 1 and schema.prefer_rename:
            old = static_methods[0].group(2)
            if old != schema.entry_point:
                notes.append(f"renaming static method '{old}' -> '{schema.entry_point}'")
                code = re.sub(rf"(\bpublic\s+static\s+[\w\<\>\[\]]+\s+){re.escape(old)}(\s*\()",
                              rf"\1{schema.entry_point}\2", code, count=1)
            return NormalizeResult(files={schema.file_name: code}, notes="\n".join(notes), ok=True)

        # Entry point exists (presumably as an instance method): inject a
        # static shim that dispatches to it reflectively.
        # FIX: entry_point is now re.escape()d before regex interpolation,
        # matching the convention used by the other sanitizers.
        if re.search(rf"\b{re.escape(schema.entry_point)}\s*\(", code):
            notes.append(f"adding static wrapper for instance method '{schema.entry_point}'")
            wrapper = (
                f"\n public static <T> Object {schema.entry_point}(Object... args) {{\n"
                f" {schema.java_class} _x = new {schema.java_class}();\n"
                f" try {{\n"
                f" // attempt reflective dispatch to instance method\n"
                f" Class<?>[] types = new Class<?>[args.length];\n"
                f" for (int i=0;i<args.length;i++) types[i] = args[i].getClass();\n"
                f" return {schema.java_class}.class.getMethod(\"{schema.entry_point}\", types).invoke(_x, args);\n"
                f" }} catch (Exception ex) {{ throw new RuntimeException(ex); }}\n"
                f" }}\n"
            )
            code = re.sub(rf"(class\s+{re.escape(schema.java_class)}\s*{{)", r"\1" + wrapper, code, count=1)
            return NormalizeResult(files={schema.file_name: code}, notes="\n".join(notes), ok=True)

        # Last resort: delegate reflectively to any public method (skipping the
        # entry point itself to avoid recursing into this shim).
        notes.append("no suitable method; adding delegating static method to first public static or instance method via reflection")
        fallback = (
            f"\n public static Object {schema.entry_point}(Object... args) {{\n"
            f" try {{\n"
            f" // try any public method first\n"
            f" for (var m : {schema.java_class}.class.getMethods()) {{\n"
            f" if (m.getName().equals(\"{schema.entry_point}\")) continue;\n"
            f" try {{ return m.invoke(m.getParameterCount()==0? new {schema.java_class}(): new {schema.java_class}(), args); }}\n"
            f" catch (Exception ignored) {{}}\n"
            f" }}\n"
            f" }} catch (Exception e) {{ throw new RuntimeException(e); }}\n"
            f" throw new RuntimeException(\"No suitable method for entry point\");\n"
            f" }}\n"
        )
        code = re.sub(rf"(class\s+{re.escape(schema.java_class)}\s*{{)", r"\1" + fallback, code, count=1)
        return NormalizeResult(files={schema.file_name: code}, notes="\n".join(notes), ok=True)
|
|
75
|
+
|
|
76
|
+
def indent(s: str, n: int = 4) -> str:
    """Prefix every non-blank line of *s* with *n* spaces.

    Blank (whitespace-only) lines are passed through untouched so the
    result stays tidy when wrapping code in a class body.
    """
    prefix = " " * n
    shifted = []
    for line in s.splitlines():
        shifted.append(prefix + line if line.strip() else line)
    return "\n".join(shifted)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# coding/llm_sanitizer/python_sanitizer.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import ast, re
|
|
4
|
+
from typing import List
|
|
5
|
+
from wisent_guard.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
|
|
6
|
+
from wisent_guard.benchmarks.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace, maybe_black
|
|
7
|
+
|
|
8
|
+
class PythonStandardizer(CodeStandardizer):
    """Normalize raw LLM output into a single Python module that exposes
    ``schema.entry_point`` as a top-level callable (renaming, adapting a
    class method, or appending a dynamic-dispatch fallback as needed)."""

    language = "python"

    def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult:
        """Extract, parse, and adapt *raw* so the sandbox can call the entry point.

        ``ok`` is False when the code does not parse, or when no adaptation
        was possible and wrappers are disallowed.
        """
        notes: List[str] = []
        code = extract_code_block(raw, prefer_langs=("python", "py"))
        code = normalize_whitespace(code)
        # Strip any fence residue extract_code_block may have left behind.
        code = re.sub(r"^```.*?\n|\n```$", "", code, flags=re.DOTALL)

        # Everything below needs an AST; bail out early on a syntax error.
        try:
            tree = ast.parse(code)
        except SyntaxError as e:
            notes.append(f"parse failed: {e}; returning raw after whitespace normalize")
            return NormalizeResult(files={schema.file_name: code}, notes="\n".join(notes), ok=False)

        fn_names = [n.name for n in tree.body if isinstance(n, ast.FunctionDef)]
        cls_nodes = [n for n in tree.body if isinstance(n, ast.ClassDef)]
        has_entry_top = schema.entry_point in fn_names

        # 1) Entry point already defined at module level: just tidy up.
        if has_entry_top:
            notes.append(f"top-level function '{schema.entry_point}' found")
            cleaned = maybe_black(code)
            return NormalizeResult(files={schema.file_name: cleaned}, notes="\n".join(notes), ok=True)

        # 2) Exactly one top-level function and renaming is allowed: rename it.
        if schema.prefer_rename and len(fn_names) == 1:
            old = fn_names[0]
            notes.append(f"renaming single function '{old}' -> '{schema.entry_point}'")
            new_code = maybe_black(self._rename_function(tree, code, old, schema.entry_point))
            return NormalizeResult(files={schema.file_name: new_code}, notes="\n".join(notes), ok=True)

        # 3) Entry point is a method on some class: append a thin module-level adapter.
        for cls in cls_nodes:
            method_names = [n.name for n in cls.body if isinstance(n, ast.FunctionDef)]
            if schema.entry_point in method_names:
                notes.append(f"found method {cls.name}.{schema.entry_point}; adding thin adapter")
                adapter = (
                    f"\n\ndef {schema.entry_point}(*args, **kwargs):\n"
                    f"    return {cls.name}().{schema.entry_point}(*args, **kwargs)\n"
                )
                final = maybe_black(code + adapter)
                return NormalizeResult(files={schema.file_name: final}, notes="\n".join(notes), ok=True)

        # 4) Conventional fallback names: rename the first match to the entry point.
        candidates = [n for n in fn_names if n in {"solve", "solution", "func", "function", "answer"}]
        if candidates:
            old = candidates[0]
            notes.append(f"renaming fallback '{old}' -> '{schema.entry_point}'")
            new_code = maybe_black(self._rename_function(tree, code, old, schema.entry_point))
            return NormalizeResult(files={schema.file_name: new_code}, notes="\n".join(notes), ok=True)

        # 5) Last resort: append an adapter that dispatches to the first callable
        #    in the module at call time.
        if schema.allow_wrapper:
            notes.append("no entry found; appending dynamic-dispatch adapter to call first callable")
            adapter = (
                f"\n\ndef {schema.entry_point}(*args, **kwargs):\n"
                f"    # fallback: try first callable in module\n"
                f"    import inspect\n"
                f"    for _name, _obj in globals().items():\n"
                f"        if callable(_obj) and _name not in ('{schema.entry_point}',):\n"
                f"            try:\n"
                f"                return _obj(*args, **kwargs)\n"
                f"            except TypeError:\n"
                f"                continue\n"
                f"    raise NameError('No suitable function for entry point')\n"
            )
            final = maybe_black(code + adapter)
            return NormalizeResult(files={schema.file_name: final}, notes="\n".join(notes), ok=True)

        return NormalizeResult(files={schema.file_name: code}, notes="\n".join(notes), ok=False)

    @staticmethod
    def _rename_function(tree: ast.AST, code: str, old: str, new: str) -> str:
        """Rename every ``def old(...)`` in *tree* to *new* via the AST.

        Falls back to a plain textual replace when AST round-tripping fails.
        This replaces two verbatim copies of the same transformer that
        previously lived inline in ``normalize`` (one of which only guarded
        ``unparse`` with try/except, not the visit itself).
        """

        class _Renamer(ast.NodeTransformer):
            def visit_FunctionDef(self, node: ast.FunctionDef):
                if node.name == old:
                    node.name = new
                return self.generic_visit(node)

        try:
            tree2 = _Renamer().visit(tree)
            ast.fix_missing_locations(tree2)
            return ast.unparse(tree2)
        except Exception:
            # Best-effort textual rename keeps the pipeline moving.
            return code.replace(f"def {old}(", f"def {new}(")
|