wisent 0.5.11-py3-none-any.whl → 0.5.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic.
- wisent/__init__.py +1 -1
- wisent/core/activations/__init__.py +26 -0
- wisent/core/activations/activations.py +96 -0
- wisent/core/activations/activations_collector.py +71 -20
- wisent/core/activations/prompt_construction_strategy.py +47 -0
- wisent/core/agent/budget.py +2 -2
- wisent/core/agent/device_benchmarks.py +1 -1
- wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
- wisent/core/agent/diagnose/response_diagnostics.py +4 -4
- wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
- wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
- wisent/core/agent/diagnose.py +2 -1
- wisent/core/autonomous_agent.py +10 -2
- wisent/core/benchmark_extractors.py +293 -0
- wisent/core/bigcode_integration.py +20 -7
- wisent/core/branding.py +108 -0
- wisent/core/cli/__init__.py +15 -0
- wisent/core/cli/create_steering_vector.py +138 -0
- wisent/core/cli/evaluate_responses.py +715 -0
- wisent/core/cli/generate_pairs.py +128 -0
- wisent/core/cli/generate_pairs_from_task.py +119 -0
- wisent/core/cli/generate_responses.py +129 -0
- wisent/core/cli/generate_vector_from_synthetic.py +149 -0
- wisent/core/cli/generate_vector_from_task.py +147 -0
- wisent/core/cli/get_activations.py +191 -0
- wisent/core/cli/optimize_classification.py +339 -0
- wisent/core/cli/optimize_steering.py +364 -0
- wisent/core/cli/tasks.py +182 -0
- wisent/core/cli_logger.py +22 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
- wisent/core/data_loaders/__init__.py +235 -0
- wisent/core/data_loaders/loaders/lm_loader.py +2 -2
- wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
- wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
- wisent/core/download_full_benchmarks.py +79 -2
- wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
- wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
- wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
- wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
- wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
- wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
- wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
- wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
- wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
- wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
- wisent/core/lm_eval_harness_ground_truth.py +3 -2
- wisent/core/main.py +57 -0
- wisent/core/model_persistence.py +2 -2
- wisent/core/models/wisent_model.py +8 -6
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/optuna/steering/steering_optimization.py +1 -1
- wisent/core/parser_arguments/__init__.py +10 -0
- wisent/core/parser_arguments/agent_parser.py +110 -0
- wisent/core/parser_arguments/configure_model_parser.py +7 -0
- wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
- wisent/core/parser_arguments/evaluate_parser.py +40 -0
- wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
- wisent/core/parser_arguments/full_optimize_parser.py +115 -0
- wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
- wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
- wisent/core/parser_arguments/generate_responses_parser.py +15 -0
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
- wisent/core/parser_arguments/generate_vector_parser.py +90 -0
- wisent/core/parser_arguments/get_activations_parser.py +90 -0
- wisent/core/parser_arguments/main_parser.py +152 -0
- wisent/core/parser_arguments/model_config_parser.py +59 -0
- wisent/core/parser_arguments/monitor_parser.py +17 -0
- wisent/core/parser_arguments/multi_steer_parser.py +47 -0
- wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
- wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
- wisent/core/parser_arguments/synthetic_parser.py +93 -0
- wisent/core/parser_arguments/tasks_parser.py +584 -0
- wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
- wisent/core/parser_arguments/utils.py +111 -0
- wisent/core/prompts/core/prompt_formater.py +3 -3
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
- wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
- wisent/core/steering_optimizer.py +45 -21
- wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
- wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
- wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
- wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
- wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
- wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
- wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
- wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
- wisent/core/tasks/livecodebench_task.py +4 -103
- wisent/core/timing_calibration.py +1 -1
- {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/METADATA +3 -3
- wisent-0.5.13.dist-info/RECORD +294 -0
- wisent-0.5.13.dist-info/entry_points.txt +2 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
- wisent/classifiers/core/atoms.py +0 -747
- wisent/classifiers/models/logistic.py +0 -29
- wisent/classifiers/models/mlp.py +0 -47
- wisent/cli/classifiers/classifier_rotator.py +0 -137
- wisent/cli/cli_logger.py +0 -142
- wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
- wisent/cli/wisent_cli/commands/listing.py +0 -154
- wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
- wisent/cli/wisent_cli/main.py +0 -93
- wisent/cli/wisent_cli/shell.py +0 -80
- wisent/cli/wisent_cli/ui.py +0 -69
- wisent/cli/wisent_cli/util/aggregations.py +0 -43
- wisent/cli/wisent_cli/util/parsing.py +0 -126
- wisent/cli/wisent_cli/version.py +0 -4
- wisent/opti/methods/__init__.py +0 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent-0.5.11.dist-info/RECORD +0 -220
- /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
- /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
- /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
- /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
- /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
- /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
- /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
- /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
- /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
- /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
- /wisent/{opti → core/opti}/core/atoms.py +0 -0
- /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
- /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
- /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
- /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
- /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
- /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
- /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
- /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
- /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
- /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
- /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
- /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
- /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
- /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
- /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
- {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/WHEEL +0 -0
- {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/top_level.txt +0 -0
wisent/core/evaluators/benchmark_specific/__init__.py
ADDED
@@ -0,0 +1,26 @@
+"""Benchmark-specific evaluators for lm-eval tasks.
+
+This module provides evaluation methods that match lm-eval's native approaches:
+- Log likelihood evaluation for multiple-choice tasks
+- Generation evaluation for text generation tasks
+- Exact match evaluation for precise answer matching
+- F1 evaluation for token-level comparison
+- Perplexity evaluation for language modeling
+- Personalization evaluation for personality trait manifestation
+"""
+
+from .log_likelihoods_evaluator import LogLikelihoodsEvaluator
+from .generation_evaluator import GenerationEvaluator
+from .exact_match_evaluator import ExactMatchEvaluator
+from .f1_evaluator import F1Evaluator
+from .perplexity_evaluator import PerplexityEvaluator
+from .personalization_evaluator import PersonalizationEvaluator
+
+__all__ = [
+    'LogLikelihoodsEvaluator',
+    'GenerationEvaluator',
+    'ExactMatchEvaluator',
+    'F1Evaluator',
+    'PerplexityEvaluator',
+    'PersonalizationEvaluator',
+]
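The new package root re-exports the six evaluators listed in __all__ above. A minimal import sketch; the imports are confirmed by this __init__.py, but the instantiation line is only a hypothetical placeholder, since the evaluators' constructor signatures are not shown in this diff.

# Imports confirmed by the new __init__.py; constructor arguments are
# not part of this diff, so the call below is a hypothetical sketch.
from wisent.core.evaluators.benchmark_specific import ExactMatchEvaluator, F1Evaluator

evaluator = ExactMatchEvaluator()  # hypothetical: real signature unknown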
wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py
RENAMED
@@ -2,19 +2,19 @@ from __future__ import annotations
 from dataclasses import dataclass
 from typing import Callable, Iterable, Optional, TYPE_CHECKING
 
-from wisent.
-from wisent.
-from wisent.
+from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.runtime import DockerSandboxExecutor
+from wisent.core.evaluators.benchmark_specific.coding.safe_docker.recipes import RECIPE_REGISTRY
+from wisent.core.evaluators.benchmark_specific.coding.metrics.core.atoms import SampleOutcome, Evaluator
 
-from wisent.
-from wisent.
-from wisent.
-from wisent.
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.python_sanitizer import PythonStandardizer
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.cpp_sanitizer import CppStandardizer
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.java_sanitizer import JavaStandardizer
 
 if TYPE_CHECKING:
-    from wisent.
-    from wisent.
-    from wisent.
+    from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result
+    from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import Provider, CodingTask
+    from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import CodeStandardizer
 
 RepairFn = Callable[[str, dict[str,str], str], dict[str,str]]
 
@@ -82,7 +82,7 @@ def _make_schema(task: CodingTask) -> TaskSchema:
     and allow_wrapper set appropriately.
 
     example:
-    >>> from wisent.
+    >>> from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask
     >>> task = CodingTask(language="python", files={}, options={"entry_point":"add","file_name":"my_solution.py"})
     >>> schema = _make_schema(task)
     >>> schema.language
@@ -128,7 +128,7 @@ class CodingEvaluator(Evaluator):
        Feedback string summarizing the result, truncated to cfg.feedback_max_chars.
 
        examples:
-       >>> from wisent.
+       >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result
        >>> res = Result(status="timeout", stdout="", stderr="", elapsed=10.0)
        >>> evaluator = CodingEvaluator(provider=None, model_fn=lambda x: {}, cfg=EvaluatorConfig())
        >>> evaluator._feedback(res)
@@ -163,8 +163,8 @@ class CodingEvaluator(Evaluator):
        Result object containing the status, stdout, stderr, and elapsed time.
 
        examples:
-       >>> from wisent.
-       >>> from wisent.
+       >>> from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask
+       >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result
        >>> task = CodingTask(language="python", files={}, options={})
        >>> files = {"solution.py": "def add(a,b): return a + b", "tests.py": "from solution import add\ndef test_ok(): assert add(1,2)==3"}
        >>> evaluator = CodingEvaluator(provider=None, model_fn=lambda x: {})
@@ -181,7 +181,7 @@ class CodingEvaluator(Evaluator):
        0.23
        """
        recipe = RECIPE_REGISTRY[task.language]
-       job = recipe.make_job(
+       job = recipe.make_job(**task.options,
                              time_limit_s=self.cfg.time_limit_s,
                              cpu_limit_s=self.cfg.cpu_limit_s,
                              mem_limit_mb=self.cfg.mem_limit_mb)
@@ -201,7 +201,7 @@ class CodingEvaluator(Evaluator):
        The sanitized files if pre_sanitize is True and a sanitizer exists for the language; otherwise, the original files.
 
        examples:
-       >>> from wisent.
+       >>> from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask
        >>> task = CodingTask(language="python", files={}, options={"entry_point":"add","file_name":"my_solution.py"})
        >>> files = {"my_solution.py": "def add(a,b): return a - b # BUG"}
        >>> evaluator = CodingEvaluator(provider=None, model_fn=lambda x: {}, cfg=EvaluatorConfig(pre_sanitize=True))
@@ -234,7 +234,7 @@ class CodingEvaluator(Evaluator):
        SampleOutcome for each task, indicating pass/fail status and elapsed time.
 
        examples:
-       >>> from wisent.
+       >>> from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask, Provider
        >>> class DummyProvider:
        ...     name = "dummy"
        ...     def iter_tasks(self):
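The one behavioral change in evaluator.py above is that recipe.make_job(...) now receives **task.options, so per-task settings such as entry_point and file_name reach the job alongside the evaluator's resource limits. A minimal sketch of that forwarding pattern, using a stand-in make_job (the real LanguageRecipe.make_job signature is not part of this diff):

# Stand-in make_job, only to illustrate how **task.options merges with
# the fixed keyword arguments; the real signature is not shown here.
def make_job(entry_point=None, file_name=None, *, time_limit_s, cpu_limit_s, mem_limit_mb):
    return {
        "entry_point": entry_point,
        "file_name": file_name,
        "time_limit_s": time_limit_s,
        "cpu_limit_s": cpu_limit_s,
        "mem_limit_mb": mem_limit_mb,
    }

options = {"entry_point": "add", "file_name": "my_solution.py"}  # like task.options
job = make_job(**options, time_limit_s=10.0, cpu_limit_s=5.0, mem_limit_mb=512)
print(job["entry_point"])  # -> add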
wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py
RENAMED
@@ -1,8 +1,8 @@
 from __future__ import annotations
 import re
 from typing import List
-from wisent.
-from wisent.
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
 
 FUNC_RE = re.compile(r"^\s*(?:template<[^>]+>\s*)?(?:[\w:\s*&<>,]+)\s+(\w+)\s*\(", re.MULTILINE)
 CLASS_RE = re.compile(r"^\s*class\s+(\w+)\s*[{:]", re.MULTILINE)
wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py
RENAMED
@@ -2,8 +2,8 @@
 from __future__ import annotations
 import re
 from typing import List
-from wisent.
-from wisent.
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
 
 CLASS_RE = re.compile(r"\bclass\s+([A-Za-z_]\w*)")
 METHOD_RE = re.compile(r"(public\s+static\s+[\w\<\>\[\]]+\s+)(\w+)\s*\(")
wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py
RENAMED
@@ -2,8 +2,8 @@
 from __future__ import annotations
 import ast, re
 from typing import List
-from wisent.
-from wisent.
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace, maybe_black
 
 class PythonStandardizer(CodeStandardizer):
     language = "python"
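The three sanitizers above change only their import paths; the extraction regexes visible as context lines are untouched. A quick self-contained check of the Java patterns against a toy snippet:

import re

# Patterns copied verbatim from the java_sanitizer context lines above.
CLASS_RE = re.compile(r"\bclass\s+([A-Za-z_]\w*)")
METHOD_RE = re.compile(r"(public\s+static\s+[\w\<\>\[\]]+\s+)(\w+)\s*\(")

src = "public class Solution { public static int add(int a, int b) { return a + b; } }"
print(CLASS_RE.search(src).group(1))   # -> Solution
print(METHOD_RE.search(src).group(2))  # -> add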
wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py
ADDED
@@ -0,0 +1,305 @@
+# coding/providers/livecodebench/provider.py
+from __future__ import annotations
+import json
+from typing import Iterable, Optional
+from ..core.atoms import CodingTask, Language
+
+
+class LiveCodeBenchProvider:
+    """
+    LiveCodeBench provider: loads real coding problems from HuggingFace.
+
+    Dataset: livecodebench/code_generation_lite
+    Supports Python problems from LeetCode, AtCoder, and CodeForces.
+    """
+    name = "livecodebench"
+
+    def __init__(
+        self,
+        language: Language = "python",
+        release_version: str = "all",
+        limit: Optional[int] = None,
+        platform: Optional[str] = None,
+    ):
+        """
+        Initialize LiveCodeBench provider.
+
+        Arguments:
+            language: Programming language (currently only "python" supported)
+            release_version: Version to load ("release_v1", "release_v2", "all")
+            limit: Maximum number of problems to load
+            platform: Filter by platform ("leetcode", "codeforces", "atcoder")
+        """
+        self.language = language
+        self.release_version = release_version
+        self.limit = limit
+        self.platform = platform
+
+        if language != "python":
+            raise NotImplementedError(
+                f"LiveCodeBench currently only supports Python. Got: {language}"
+            )
+
+    def iter_tasks(self, split: str = "test") -> Iterable[CodingTask]:
+        """
+        Iterate over LiveCodeBench coding tasks.
+
+        Arguments:
+            split: Dataset split (only "test" is available for LiveCodeBench)
+
+        Yields:
+            CodingTask objects with solution file, test file, and options
+        """
+        from datasets import load_dataset
+
+        # Load dataset from HuggingFace
+        dataset = load_dataset("livecodebench/code_generation_lite", split=split)
+
+        # Filter by version (date range)
+        if self.release_version == "release_v1":
+            dataset = dataset.filter(
+                lambda x: x["contest_date"] >= "2023-05-01" and x["contest_date"] <= "2023-10-31"
+            )
+        elif self.release_version == "release_v2":
+            dataset = dataset.filter(
+                lambda x: x["contest_date"] >= "2023-11-01" and x["contest_date"] <= "2024-04-30"
+            )
+
+        # Filter by platform if specified
+        if self.platform:
+            platform_lower = self.platform.lower()
+            dataset = dataset.filter(
+                lambda x: x["platform"].lower() == platform_lower
+            )
+
+        # Apply limit
+        if self.limit:
+            dataset = dataset.select(range(min(self.limit, len(dataset))))
+
+        # Convert each problem to a CodingTask
+        for idx, problem in enumerate(dataset):
+            task = self._problem_to_task(problem, idx)
+            if task:
+                yield task
+
+    def _problem_to_task(self, problem: dict, idx: int) -> Optional[CodingTask]:
+        """
+        Convert a LiveCodeBench problem to a CodingTask.
+
+        Arguments:
+            problem: Problem dictionary from HuggingFace dataset
+            idx: Problem index
+
+        Returns:
+            CodingTask or None if conversion fails
+        """
+        try:
+            platform = problem["platform"].lower()
+            question_id = problem["question_id"]
+
+            # Parse test cases
+            public_tests = json.loads(problem["public_test_cases"])
+
+            if not public_tests:
+                return None
+
+            # Determine test type and generate appropriate test file
+            test_type = public_tests[0].get("testtype", "stdin")
+
+            if test_type == "functional":
+                # LeetCode-style: function calls with arguments
+                test_file = self._generate_functional_test(problem, public_tests)
+            else:
+                # stdin: CodeForces/AtCoder style
+                test_file = self._generate_stdin_test(problem, public_tests)
+
+            if not test_file:
+                return None
+
+            # Generate solution file template
+            solution_file = self._generate_solution_template(problem)
+
+            files = {
+                "solution.py": solution_file,
+                "tests.py": test_file,
+            }
+
+            options = {
+                "problem_id": question_id,
+                "platform": platform,
+                "difficulty": problem.get("difficulty", "unknown"),
+            }
+
+            return CodingTask(
+                language=self.language,
+                files=files,
+                options=options,
+            )
+
+        except Exception as e:
+            # Skip problematic problems
+            import logging
+            logging.warning(f"Failed to convert problem {idx}: {e}")
+            return None
+
+    def _generate_solution_template(self, problem: dict) -> str:
+        """
+        Generate a solution template from starter code or problem description.
+
+        Arguments:
+            problem: Problem dictionary
+
+        Returns:
+            Python solution template as string
+        """
+        starter_code = problem.get("starter_code", "").strip()
+
+        if starter_code:
+            # Use provided starter code
+            return starter_code
+        else:
+            # Generate minimal template for stdin problems
+            return """# Read input and solve the problem
+import sys
+
+def solve():
+    # Read input from stdin
+    lines = sys.stdin.read().strip().split('\\n')
+
+    # TODO: Implement solution
+    pass
+
+if __name__ == "__main__":
+    solve()
+"""
+
+    def _generate_functional_test(self, problem: dict, test_cases: list) -> str:
+        """
+        Generate test file for LeetCode-style functional tests.
+
+        Arguments:
+            problem: Problem dictionary
+            test_cases: List of test case dictionaries
+
+        Returns:
+            Python test file content
+        """
+        starter_code = problem.get("starter_code", "").strip()
+
+        if not starter_code:
+            return ""
+
+        # Extract class and method name from starter code
+        import re
+        class_match = re.search(r"class\s+(\w+)", starter_code)
+        method_match = re.search(r"def\s+(\w+)\s*\(", starter_code)
+
+        if not class_match or not method_match:
+            return ""
+
+        class_name = class_match.group(1)
+        method_name = method_match.group(1)
+
+        # Generate test file
+        test_code = f"""from solution import {class_name}
+
+def test_functional():
+    solution = {class_name}()
+
+"""
+
+        for i, test in enumerate(test_cases):
+            input_str = test.get("input", "")
+            expected_output = test.get("output", "")
+
+            # Parse input (typically JSON array where first element is the actual argument)
+            try:
+                # Try to evaluate as Python literal
+                import ast
+                parsed = ast.literal_eval(input_str)
+
+                # If it's a list with one element that's also a list, use that inner list
+                if isinstance(parsed, list) and len(parsed) == 1 and isinstance(parsed[0], list):
+                    args = [parsed[0]]
+                elif isinstance(parsed, list):
+                    args = [parsed]
+                else:
+                    args = [parsed]
+            except:
+                # Fallback: use raw string
+                args = [input_str]
+
+            # Parse expected output
+            try:
+                import ast
+                expected = ast.literal_eval(expected_output)
+            except:
+                expected = expected_output
+
+            # Generate assertion
+            args_str = ", ".join(repr(arg) for arg in args)
+            test_code += f"    # Test case {i + 1}\n"
+            test_code += f"    result = solution.{method_name}({args_str})\n"
+            test_code += f"    assert result == {repr(expected)}, f\"Test {i + 1} failed: {{result}} != {repr(expected)}\"\n\n"
+
+        test_code += "if __name__ == '__main__':\n"
+        test_code += "    test_functional()\n"
+        test_code += "    print('All tests passed!')\n"
+
+        return test_code
+
+    def _generate_stdin_test(self, problem: dict, test_cases: list) -> str:
+        """
+        Generate test file for stdin-based tests (CodeForces/AtCoder style).
+
+        Arguments:
+            problem: Problem dictionary
+            test_cases: List of test case dictionaries
+
+        Returns:
+            Python test file content
+        """
+        # For stdin tests, we run the solution and compare output
+        test_code = """import subprocess
+import sys
+
+def test_stdin():
+    test_cases = [
+"""
+
+        for i, test in enumerate(test_cases):
+            input_data = test.get("input", "")
+            expected_output = test.get("output", "")
+
+            test_code += f"        # Test case {i + 1}\n"
+            test_code += f"        ({repr(input_data)}, {repr(expected_output)}),\n"
+
+        test_code += """    ]
+
+    for i, (input_data, expected_output) in enumerate(test_cases):
+        # Run solution with input
+        proc = subprocess.run(
+            [sys.executable, "solution.py"],
+            input=input_data,
+            capture_output=True,
+            text=True,
+            timeout=5
+        )
+
+        actual_output = proc.stdout.strip()
+        expected_output = expected_output.strip()
+
+        assert actual_output == expected_output, (
+            f"Test case {i + 1} failed:\\n"
+            f"  Input: {input_data[:100]}\\n"
+            f"  Expected: {expected_output[:200]}\\n"
+            f"  Got: {actual_output[:200]}"
+        )
+
+    print(f'All {len(test_cases)} test(s) passed!')
+
+if __name__ == '__main__':
+    test_stdin()
+"""
+
+        return test_code
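Taken together, the provider streams HuggingFace problems as sandbox-ready tasks: functional (LeetCode-style) problems get an assertion-based test file, stdin problems get a subprocess harness. A usage sketch, assuming the datasets package is installed and HuggingFace is reachable; the constructor arguments and the problem_id/difficulty option keys are confirmed by the code above.

# Usage sketch; requires the `datasets` package and network access.
from wisent.core.evaluators.benchmark_specific.coding.providers.livecodebench.provider import (
    LiveCodeBenchProvider,
)

provider = LiveCodeBenchProvider(release_version="release_v1", platform="leetcode", limit=3)
for task in provider.iter_tasks():
    print(task.options["problem_id"], task.options["difficulty"])
    print(task.files["tests.py"].splitlines()[0])  # first line of the generated test file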
wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py
RENAMED
@@ -1,10 +1,10 @@
 from __future__ import annotations
 import json, os, subprocess, tempfile
 from typing import TYPE_CHECKING
-from wisent.
+from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result, SandboxExecutor
 
 if TYPE_CHECKING:
-    from wisent.
+    from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job
 
 __all__ = ["DockerSandboxExecutor"]
 
@@ -31,6 +31,38 @@ class DockerSandboxExecutor(SandboxExecutor):
     def __init__(self, image: str = DEFAULT_IMAGE, runtime: str | None = None):
         self.image = image
         self.runtime = runtime
+        self._check_docker_available()
+
+    def _check_docker_available(self) -> None:
+        """
+        Check if Docker daemon is running and accessible.
+
+        Raises:
+            RuntimeError: If Docker is not available or not running.
+        """
+        try:
+            result = subprocess.run(
+                ["docker", "info"],
+                capture_output=True,
+                text=True,
+                timeout=5
+            )
+            if result.returncode != 0:
+                raise RuntimeError(
+                    "Docker daemon is not running. Please start Docker and try again.\n"
+                    f"Error: {result.stderr}"
+                )
+        except FileNotFoundError:
+            raise RuntimeError(
+                "Docker command not found. Please install Docker:\n"
+                "  - macOS: https://docs.docker.com/desktop/install/mac-install/\n"
+                "  - Linux: https://docs.docker.com/engine/install/\n"
+                "  - Windows: https://docs.docker.com/desktop/install/windows-install/"
+            )
+        except subprocess.TimeoutExpired:
+            raise RuntimeError(
+                "Docker command timed out. Docker daemon may be unresponsive."
+            )
 
     def run(self, files: dict[str, str], job: Job) -> Result:
         """
@@ -49,8 +81,8 @@ class DockerSandboxExecutor(SandboxExecutor):
        A Result object with the outcome of the execution.
 
        example (pythonm add function)
-       >>> from wisent.
-       >>> from wisent.
+       >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job, Result
+       >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.runtime import DockerSandboxExecutor
        >>> job = Job(
        ...     language="python",
        ...     compile_argv=None,
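Because the availability probe now runs in __init__, callers get a clear RuntimeError before any job is built, rather than a confusing failure inside run(). A minimal sketch:

# DockerSandboxExecutor now fails fast at construction time when the
# Docker CLI or daemon is unavailable (per the hunk above).
from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.runtime import (
    DockerSandboxExecutor,
)

try:
    executor = DockerSandboxExecutor()
except RuntimeError as err:
    print(f"Sandbox unavailable: {err}")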
wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py
RENAMED
@@ -1,12 +1,10 @@
 from __future__ import annotations
 import json, os, shutil, subprocess, sys, time, signal, resource
 
-from wisent.benchmarks.coding.safe_docker.core.atoms import Job
-
 JOB_FILE = "/job/job.json"
 WORKDIR = "/work"
 
-def set_limits(job
+def set_limits(job):
     """
     Set resource limits for the sandboxed process.
 
@@ -25,7 +23,7 @@ def set_limits(job: Job):
     resource.setrlimit(resource.RLIMIT_CORE,(0,0))
     os.setsid()
 
-def run(argv: list[str], job
+def run(argv: list[str], job) -> tuple[int,str,str,float,str]:
     """
     Run a command in a subprocess with resource limits.
 
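entrypoint.py drops its package import (and the Job type hints that required it), so the in-container script has no dependency on the wisent package; set_limits still applies rlimits and detaches into a new session before the command runs. An illustrative sketch of that preexec pattern; the real set_limits also applies the job's CPU and memory limits, which this diff shows only in part.

# Illustrative POSIX-only sketch of the preexec pattern entrypoint.py uses:
# disable core dumps and start a new session in the child before exec.
import os
import resource
import subprocess
import sys

def set_limits():
    resource.setrlimit(resource.RLIMIT_CORE, (0, 0))  # no core dumps
    os.setsid()                                       # detach into a new session

proc = subprocess.run(
    [sys.executable, "-c", "print('ok')"],
    preexec_fn=set_limits,
    capture_output=True,
    text=True,
)
print(proc.stdout.strip())  # -> ok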
wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py
RENAMED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from typing import Dict
-from wisent.
+from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job, LanguageRecipe
 
 class PythonRecipe(LanguageRecipe):
     """