wisent 0.5.11__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
Potentially problematic release.
This version of wisent might be problematic.
- wisent/__init__.py +1 -1
- wisent/core/activations/__init__.py +26 -0
- wisent/core/activations/activations.py +96 -0
- wisent/core/activations/activations_collector.py +71 -20
- wisent/core/activations/prompt_construction_strategy.py +47 -0
- wisent/core/agent/budget.py +2 -2
- wisent/core/agent/device_benchmarks.py +1 -1
- wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
- wisent/core/agent/diagnose/response_diagnostics.py +4 -4
- wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
- wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
- wisent/core/agent/diagnose.py +2 -1
- wisent/core/autonomous_agent.py +10 -2
- wisent/core/benchmark_extractors.py +293 -0
- wisent/core/bigcode_integration.py +20 -7
- wisent/core/branding.py +108 -0
- wisent/core/cli/__init__.py +15 -0
- wisent/core/cli/create_steering_vector.py +138 -0
- wisent/core/cli/evaluate_responses.py +715 -0
- wisent/core/cli/generate_pairs.py +128 -0
- wisent/core/cli/generate_pairs_from_task.py +119 -0
- wisent/core/cli/generate_responses.py +129 -0
- wisent/core/cli/generate_vector_from_synthetic.py +149 -0
- wisent/core/cli/generate_vector_from_task.py +147 -0
- wisent/core/cli/get_activations.py +191 -0
- wisent/core/cli/optimize_classification.py +339 -0
- wisent/core/cli/optimize_steering.py +364 -0
- wisent/core/cli/tasks.py +182 -0
- wisent/core/cli_logger.py +22 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
- wisent/core/data_loaders/__init__.py +235 -0
- wisent/core/data_loaders/loaders/lm_loader.py +2 -2
- wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
- wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
- wisent/core/download_full_benchmarks.py +79 -2
- wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
- wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
- wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
- wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
- wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
- wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
- wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
- wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
- wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
- wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
- wisent/core/lm_eval_harness_ground_truth.py +3 -2
- wisent/core/main.py +57 -0
- wisent/core/model_persistence.py +2 -2
- wisent/core/models/wisent_model.py +8 -6
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/optuna/steering/steering_optimization.py +1 -1
- wisent/core/parser_arguments/__init__.py +10 -0
- wisent/core/parser_arguments/agent_parser.py +110 -0
- wisent/core/parser_arguments/configure_model_parser.py +7 -0
- wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
- wisent/core/parser_arguments/evaluate_parser.py +40 -0
- wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
- wisent/core/parser_arguments/full_optimize_parser.py +115 -0
- wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
- wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
- wisent/core/parser_arguments/generate_responses_parser.py +15 -0
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
- wisent/core/parser_arguments/generate_vector_parser.py +90 -0
- wisent/core/parser_arguments/get_activations_parser.py +90 -0
- wisent/core/parser_arguments/main_parser.py +152 -0
- wisent/core/parser_arguments/model_config_parser.py +59 -0
- wisent/core/parser_arguments/monitor_parser.py +17 -0
- wisent/core/parser_arguments/multi_steer_parser.py +47 -0
- wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
- wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
- wisent/core/parser_arguments/synthetic_parser.py +93 -0
- wisent/core/parser_arguments/tasks_parser.py +584 -0
- wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
- wisent/core/parser_arguments/utils.py +111 -0
- wisent/core/prompts/core/prompt_formater.py +3 -3
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
- wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
- wisent/core/steering_optimizer.py +45 -21
- wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
- wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
- wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
- wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
- wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
- wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
- wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
- wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
- wisent/core/tasks/livecodebench_task.py +4 -103
- wisent/core/timing_calibration.py +1 -1
- {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/METADATA +3 -3
- wisent-0.5.13.dist-info/RECORD +294 -0
- wisent-0.5.13.dist-info/entry_points.txt +2 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
- wisent/classifiers/core/atoms.py +0 -747
- wisent/classifiers/models/logistic.py +0 -29
- wisent/classifiers/models/mlp.py +0 -47
- wisent/cli/classifiers/classifier_rotator.py +0 -137
- wisent/cli/cli_logger.py +0 -142
- wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
- wisent/cli/wisent_cli/commands/listing.py +0 -154
- wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
- wisent/cli/wisent_cli/main.py +0 -93
- wisent/cli/wisent_cli/shell.py +0 -80
- wisent/cli/wisent_cli/ui.py +0 -69
- wisent/cli/wisent_cli/util/aggregations.py +0 -43
- wisent/cli/wisent_cli/util/parsing.py +0 -126
- wisent/cli/wisent_cli/version.py +0 -4
- wisent/opti/methods/__init__.py +0 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent-0.5.11.dist-info/RECORD +0 -220
- /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
- /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
- /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
- /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
- /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
- /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
- /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
- /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
- /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
- /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
- /wisent/{opti → core/opti}/core/atoms.py +0 -0
- /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
- /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
- /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
- /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
- /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
- /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
- /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
- /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
- /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
- /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
- /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
- /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
- /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
- /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
- /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
- {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/WHEEL +0 -0
- {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/top_level.txt +0 -0
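
Most of the churn above is a restructuring: the top-level wisent.synthetic, wisent.benchmarks, wisent.opti, and wisent.cli packages move under wisent.core. A hypothetical migration sketch for downstream imports follows; the module paths are taken from the rename entries above, while the try/except fallback pattern and the alias names are illustrative, not part of the package.

# Hypothetical migration sketch (module paths from the rename entries above;
# the fallback pattern and aliases are illustrative only).
try:
    # 0.5.13 layout
    from wisent.core.synthetic.generators import pairs_generator
    from wisent.core.data_loaders import rotator as data_loader_rotator
except ImportError:
    # 0.5.11 layout
    from wisent.synthetic.generators import pairs_generator
    from wisent.cli.data_loaders import data_loader_rotator
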
@@ -3,7 +3,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 
 from wisent.core.contrastive_pairs.lm_eval_pairs.lm_extractor_registry import get_extractor
-from wisent.
+from wisent.core.cli_logger import setup_logger, bind
 
 if TYPE_CHECKING:
     from lm_eval.api.task import ConfigurableTask
@@ -0,0 +1,235 @@
+"""
+Data loaders for various benchmarks.
+
+This module provides data loaders for tasks that need special handling.
+"""
+
+from typing import Any, Dict, List, Optional
+from dataclasses import dataclass
+
+
+__all__ = ["LiveCodeBenchLoader", "LiveCodeBenchProblem"]
+
+
+@dataclass
+class LiveCodeBenchProblem:
+    """
+    A LiveCodeBench coding problem.
+
+    Attributes:
+        question_title: Title of the problem
+        question_content: Full problem description
+        platform: Source platform (codeforces, leetcode, atcoder)
+        question_id: Unique problem identifier
+        contest_id: Contest identifier
+        contest_date: Date of the contest
+        starter_code: Optional starter code template
+        difficulty: Problem difficulty (easy, medium, hard)
+        public_test_cases: Public test cases
+        private_test_cases: Private test cases
+        metadata: Additional metadata
+        answer: Correct answer/solution (for TaskInterface compatibility)
+        good_code: Code that passes tests (from wisent-core)
+        bad_code: Code that fails tests (from wisent-core)
+    """
+    question_title: str
+    question_content: str
+    platform: str
+    question_id: str
+    contest_id: str
+    contest_date: str
+    starter_code: str
+    difficulty: str
+    public_test_cases: List[Any]
+    private_test_cases: List[Any]
+    metadata: Dict[str, Any]
+    answer: Optional[str] = None
+    good_code: Optional[str] = None
+    bad_code: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary format."""
+        return {
+            "task_id": self.question_id,
+            "question_id": self.question_id,  # Also include question_id for compatibility
+            "question_title": self.question_title,
+            "question_content": self.question_content,
+            "platform": self.platform.upper(),
+            "contest_id": self.contest_id,
+            "contest_date": self.contest_date,
+            "starter_code": self.starter_code,
+            "difficulty": self.difficulty.upper(),
+            "public_test_cases": [
+                {
+                    "input": tc if isinstance(tc, str) else str(tc),
+                    "output": "",
+                    "testtype": "FUNCTIONAL"
+                }
+                for tc in self.public_test_cases[:3]  # Limit to first 3 for brevity
+            ] if self.public_test_cases else [],
+            "metadata": self.metadata,
+            "answer": self.good_code,  # Use good_code as the correct answer
+            "good_code": self.good_code,
+            "bad_code": self.bad_code,
+        }
+
+
+class LiveCodeBenchLoader:
+    """
+    LiveCodeBench data loader.
+
+    Loads real coding problems from the LiveCodeBench dataset on HuggingFace.
+    Dataset: livecodebench/code_generation_lite
+
+    Also loads pre-generated good/bad code solutions from local cache.
+    """
+
+    def __init__(self, solution_cache_dir: Optional[str] = None):
+        self._dataset_name = "livecodebench/code_generation_lite"
+        self._cached_dataset = None
+        self._solution_data = None
+        self.solution_cache_dir = solution_cache_dir or "./livecodebench_solutions"
+
+    def list_available_versions(self) -> List[str]:
+        """List available LiveCodeBench versions."""
+        # The dataset doesn't have explicit versions, but we can filter by date ranges
+        return ["release_v1", "release_v2", "all"]
+
+    def get_version_info(self, version: str) -> Dict[str, Any]:
+        """Get information about a specific version."""
+        version_info = {
+            "release_v1": {
+                "version": "release_v1",
+                "description": "LiveCodeBench Release V1 (May 2023 - Oct 2023)",
+                "problems": "~500",
+                "date_range": "2023-05-01 to 2023-10-31",
+            },
+            "release_v2": {
+                "version": "release_v2",
+                "description": "LiveCodeBench Release V2 (Nov 2023 - Apr 2024)",
+                "problems": "~500",
+                "date_range": "2023-11-01 to 2024-04-30",
+            },
+            "all": {
+                "version": "all",
+                "description": "All LiveCodeBench problems",
+                "problems": "1055",
+                "date_range": "2023-05-01 to 2024-12-31",
+            }
+        }
+        return version_info.get(version, version_info["all"])
+
+    def _load_solution_data(self) -> Dict[str, Any]:
+        """
+        Load pre-generated AI model solutions from local cache.
+
+        Returns:
+            Dictionary with question_id -> {good_code, bad_code, difficulty} mapping.
+        """
+        import json
+        from pathlib import Path
+
+        if self._solution_data is not None:
+            return self._solution_data
+
+        cache_file = Path(self.solution_cache_dir) / "solutions.json"
+
+        if not cache_file.exists():
+            import logging
+            logging.warning(
+                f"Solutions cache not found at {cache_file}. "
+                f"Run solution generation first using LiveCodeBenchSolutionGenerator. "
+                f"Problems will have no answer field."
+            )
+            self._solution_data = {}
+            return {}
+
+        with open(cache_file, 'r') as f:
+            data = json.load(f)
+
+        # Create mapping from question_id to solutions
+        solution_map = {}
+        for problem in data.get("problems", []):
+            question_id = problem.get("question_id")
+            if question_id and problem.get("good_example") and problem.get("bad_example"):
+                solution_map[question_id] = {
+                    "good_code": problem["good_example"].get("code", ""),
+                    "bad_code": problem["bad_example"].get("code", ""),
+                    "difficulty": problem.get("difficulty", "unknown"),
+                }
+
+        self._solution_data = solution_map
+        return solution_map
+
+    def load_problems(
+        self,
+        release_version: str = "all",
+        limit: Optional[int] = None
+    ) -> List[LiveCodeBenchProblem]:
+        """
+        Load LiveCodeBench problems from HuggingFace.
+
+        Arguments:
+            release_version: Version to load (release_v1, release_v2, or all)
+            limit: Maximum number of problems to load
+
+        Returns:
+            List of LiveCodeBenchProblem objects
+        """
+        from datasets import load_dataset
+
+        # Load dataset (cached after first load)
+        if self._cached_dataset is None:
+            self._cached_dataset = load_dataset(self._dataset_name, split="test")
+
+        dataset = self._cached_dataset
+
+        # Filter by version if needed
+        if release_version == "release_v1":
+            # Filter problems from May 2023 - Oct 2023
+            dataset = dataset.filter(
+                lambda x: x["contest_date"] >= "2023-05-01" and x["contest_date"] <= "2023-10-31"
+            )
+        elif release_version == "release_v2":
+            # Filter problems from Nov 2023 - Apr 2024
+            dataset = dataset.filter(
+                lambda x: x["contest_date"] >= "2023-11-01" and x["contest_date"] <= "2024-04-30"
+            )
+        # "all" or any other value: use all problems
+
+        # Apply limit
+        if limit:
+            dataset = dataset.select(range(min(limit, len(dataset))))
+
+        # Load solution data from wisent-core
+        solution_map = self._load_solution_data()
+
+        # Convert to LiveCodeBenchProblem objects
+        problems = []
+        for item in dataset:
+            question_id = item.get("question_id", "")
+
+            # Get solutions if available
+            solutions = solution_map.get(question_id, {})
+            good_code = solutions.get("good_code")
+            bad_code = solutions.get("bad_code")
+
+            problem = LiveCodeBenchProblem(
+                question_title=item.get("question_title", ""),
+                question_content=item.get("question_content", ""),
+                platform=item.get("platform", ""),
+                question_id=question_id,
+                contest_id=str(item.get("contest_id", "")),
+                contest_date=item.get("contest_date", ""),
+                starter_code=item.get("starter_code", ""),
+                difficulty=item.get("difficulty", ""),
+                public_test_cases=item.get("public_test_cases", []),
+                private_test_cases=item.get("private_test_cases", []),
+                metadata=item.get("metadata", {}),
+                answer=good_code,  # Set answer field for TaskInterface
+                good_code=good_code,
+                bad_code=bad_code,
+            )
+            problems.append(problem)
+
+        return problems
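
A minimal usage sketch for the loader added above (the file list pairs this +235-line hunk with wisent/core/data_loaders/__init__.py). Only names visible in the hunk are used; the cache path and limit are illustrative values, and loading requires the datasets package plus access to livecodebench/code_generation_lite.

# Sketch based on the new data_loaders module above; sample values are illustrative.
from wisent.core.data_loaders import LiveCodeBenchLoader

loader = LiveCodeBenchLoader(solution_cache_dir="./livecodebench_solutions")
problems = loader.load_problems(release_version="release_v1", limit=5)
for p in problems:
    # good_code/bad_code are only populated when solutions.json exists in the cache dir
    print(p.question_id, p.difficulty, p.good_code is not None)
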
@@ -91,8 +91,8 @@ class LMEvalDataLoader(BaseDataLoader):
         train_set = ContrastivePairSet("lm_eval_train", train_pairs, task_type=task_name)
         test_set = ContrastivePairSet("lm_eval_test", test_pairs, task_type=task_name)
 
-        train_set.validate()
-        test_set.validate()
+        train_set.validate(raise_on_critical=False)
+        test_set.validate(raise_on_critical=False)
 
         return LoadDataResult(
             train_qa_pairs=train_set,
@@ -0,0 +1,300 @@
+from __future__ import annotations
+from typing import Any, Optional
+import logging
+import random
+
+from wisent.core.data_loaders.core.atoms import BaseDataLoader, DataLoaderError, LoadDataResult
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
+from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
+from wisent.core.task_interface import get_task, list_tasks
+from wisent.core.task_interface import TaskInterface
+
+__all__ = [
+    "TaskInterfaceDataLoader",
+]
+
+log = logging.getLogger(__name__)
+
+
+class TaskInterfaceDataLoader(BaseDataLoader):
+    """
+    Load contrastive pairs from TaskInterface tasks (AIME, HMMT, LiveCodeBench, etc.).
+
+    This loader bridges TaskInterface tasks with the CLI training pipeline by:
+    1. Loading problem data from TaskInterface tasks
+    2. Converting problems into contrastive pairs (correct/incorrect answers)
+    3. Splitting into train/test sets
+
+    Usage:
+        wisent train model meta-llama/Llama-3.2-1B-Instruct loader task_interface task gsm8k training_limit 100
+        wisent train model meta-llama/Llama-3.2-1B-Instruct loader task_interface task aime training_limit 50
+        wisent train model meta-llama/Llama-3.2-1B-Instruct loader task_interface task livecodebench training_limit 200
+    """
+
+    name = "task_interface"
+    description = "Load from TaskInterface tasks (AIME, HMMT, LiveCodeBench, GSM8K, etc.)"
+
+    def load(
+        self,
+        task: Optional[str] = None,
+        split_ratio: Optional[float] = None,
+        seed: int = 42,
+        limit: Optional[int] = None,
+        training_limit: Optional[int] = None,
+        testing_limit: Optional[int] = None,
+        **kwargs: Any,
+    ) -> LoadDataResult:
+        """
+        Load contrastive pairs from a TaskInterface task.
+
+        Arguments:
+            task: Name of the TaskInterface task (e.g., 'gsm8k', 'aime', 'livecodebench')
+            split_ratio: Fraction of data for training (default: 0.8)
+            seed: Random seed for splitting
+            limit: Total number of problems to load
+            training_limit: Maximum training examples
+            testing_limit: Maximum testing examples
+            **kwargs: Additional arguments passed to the task
+
+        Returns:
+            LoadDataResult with train/test contrastive pairs
+
+        Raises:
+            DataLoaderError: If task is not specified or not found
+        """
+        if not task:
+            available = list_tasks()
+            raise DataLoaderError(
+                f"TaskInterface loader requires a 'task' parameter. "
+                f"Available tasks: {', '.join(available[:10])}..."
+            )
+
+        # Ensure split ratio is valid
+        split_ratio = self._effective_split(split_ratio)
+
+        # Load the task
+        try:
+            task_obj: TaskInterface = get_task(task, limit=limit)
+        except ValueError as e:
+            available = list_tasks()
+            raise DataLoaderError(
+                f"TaskInterface task '{task}' not found. "
+                f"Available tasks: {', '.join(available[:20])}..."
+            ) from e
+
+        # Load problem data
+        log.info(f"Loading data from TaskInterface task: {task}")
+        problems = task_obj.load_data(limit=limit)
+
+        if not problems:
+            raise DataLoaderError(f"TaskInterface task '{task}' returned no data")
+
+        log.info(f"Loaded {len(problems)} problems from {task}")
+
+        # Convert problems to contrastive pairs
+        pairs = self._convert_to_contrastive_pairs(task_obj, problems)
+
+        if not pairs:
+            raise DataLoaderError(
+                f"Could not generate any contrastive pairs from {task}. "
+                f"Problems may be missing required fields."
+            )
+
+        log.info(f"Generated {len(pairs)} contrastive pairs")
+
+        # Shuffle and split
+        random.seed(seed)
+        random.shuffle(pairs)
+
+        split_idx = int(len(pairs) * split_ratio)
+        train_pairs = pairs[:split_idx]
+        test_pairs = pairs[split_idx:]
+
+        # Apply limits
+        if training_limit:
+            train_pairs = train_pairs[:training_limit]
+        if testing_limit:
+            test_pairs = test_pairs[:testing_limit]
+
+        log.info(f"Split: {len(train_pairs)} train, {len(test_pairs)} test")
+
+        # Create ContrastivePairSets
+        train_set = ContrastivePairSet(name=f"{task}_train", pairs=train_pairs, task_type="classification")
+        test_set = ContrastivePairSet(name=f"{task}_test", pairs=test_pairs, task_type="classification")
+
+        return LoadDataResult(
+            train_qa_pairs=train_set,
+            test_qa_pairs=test_set,
+            task_type="classification",
+            lm_task_data=None,  # TaskInterface tasks don't use lm-eval format
+        )
+
+    def _convert_to_contrastive_pairs(
+        self,
+        task_obj: TaskInterface,
+        problems: list[dict[str, Any]],
+    ) -> list[ContrastivePair]:
+        """
+        Convert task problems into contrastive pairs.
+
+        For each problem, we create a contrastive pair with:
+        - Positive response: The correct answer
+        - Negative response: An incorrect answer (generated or from problem data)
+
+        Arguments:
+            task_obj: The TaskInterface task object
+            problems: List of problem dictionaries
+
+        Returns:
+            List of ContrastivePair objects
+        """
+        pairs = []
+        extractor = task_obj.get_extractor()
+        task_name = task_obj.get_name()
+
+        for idx, problem in enumerate(problems):
+            try:
+                # Get the prompt/question
+                if hasattr(task_obj, 'doc_to_text'):
+                    prompt = task_obj.doc_to_text(problem)
+                else:
+                    # Fallback: extract prompt from problem dict
+                    prompt = self._extract_prompt_from_problem(problem, task_name)
+
+                if not prompt:
+                    log.warning(f"Problem {idx} has no prompt, skipping")
+                    continue
+
+                # Get correct answer
+                correct_answer = self._get_correct_answer(problem, task_name)
+                if not correct_answer:
+                    log.warning(f"Problem {idx} has no correct answer, skipping")
+                    continue
+
+                # Generate incorrect answer
+                incorrect_answer = self._generate_incorrect_answer(
+                    problem, correct_answer, task_name, extractor
+                )
+                if not incorrect_answer:
+                    log.warning(f"Problem {idx}: could not generate incorrect answer, skipping")
+                    continue
+
+                # Create contrastive pair
+                pair = ContrastivePair(
+                    prompt=prompt,
+                    positive_response=PositiveResponse(model_response=correct_answer),
+                    negative_response=NegativeResponse(model_response=incorrect_answer),
+                    label="correct",
+                )
+                pairs.append(pair)
+
+            except Exception as e:
+                log.warning(f"Problem {idx}: failed to create pair: {e}")
+                continue
+
+        return pairs
+
+    def _extract_prompt_from_problem(self, problem: dict[str, Any], task_name: str) -> Optional[str]:
+        """Extract prompt/question from a problem dict."""
+        # Try common field names for prompts
+        prompt_fields = ["question", "prompt", "problem", "text", "input", "query", "doc"]
+
+        for field in prompt_fields:
+            if field in problem:
+                value = problem[field]
+                if isinstance(value, str) and value.strip():
+                    return value.strip()
+
+        # If no prompt found, try to construct from problem data
+        if task_name == "gsm8k" and "question" in problem:
+            return problem["question"]
+
+        return None
+
+    def _get_correct_answer(self, problem: dict[str, Any], task_name: str) -> Optional[str]:
+        """Extract the correct answer from a problem."""
+        # Try common field names
+        answer_fields = ["answer", "target", "label", "solution", "expected_output"]
+
+        for field in answer_fields:
+            if field in problem:
+                answer = problem[field]
+                if isinstance(answer, (str, int, float)):
+                    return str(answer)
+                elif isinstance(answer, dict):
+                    # Try to extract from nested dict
+                    if "answer" in answer:
+                        return str(answer["answer"])
+                    if "text" in answer:
+                        return str(answer["text"])
+
+        return None
+
+    def _generate_incorrect_answer(
+        self,
+        problem: dict[str, Any],
+        correct_answer: str,
+        task_name: str,
+        extractor: Any,
+    ) -> Optional[str]:
+        """
+        Generate an incorrect answer for a problem.
+
+        Strategy:
+        1. Check if problem has bad_code/incorrect answer field (for LiveCodeBench)
+        2. If problem has multiple choices, use an incorrect choice
+        3. For numerical answers, perturb the number
+        4. For text answers, use a generic incorrect response
+        """
+        # Strategy 0: Check for explicit bad_code field (LiveCodeBench)
+        if "bad_code" in problem and problem["bad_code"]:
+            return problem["bad_code"]
+
+        # Strategy 1: Check for multiple choice options
+        choices_fields = ["choices", "options", "mc1_targets", "mc2_targets"]
+        for field in choices_fields:
+            if field in problem:
+                choices = problem[field]
+                if isinstance(choices, dict):
+                    # Handle mc1_targets/mc2_targets format
+                    if "choices" in choices and "labels" in choices:
+                        incorrect_indices = [
+                            i for i, label in enumerate(choices["labels"]) if label == 0
+                        ]
+                        if incorrect_indices:
+                            return choices["choices"][random.choice(incorrect_indices)]
+                elif isinstance(choices, list) and choices:
+                    # Filter out correct answer
+                    incorrect_choices = [
+                        c for c in choices
+                        if self._normalize_for_comparison(str(c)) != self._normalize_for_comparison(correct_answer)
+                    ]
+                    if incorrect_choices:
+                        return str(random.choice(incorrect_choices))
+
+        # Strategy 2: Numerical answer perturbation
+        try:
+            correct_num = float(correct_answer.strip())
+            # Perturb by 10-50%
+            perturbation = random.uniform(1.1, 1.5) if random.random() > 0.5 else random.uniform(0.5, 0.9)
+            incorrect_num = correct_num * perturbation
+            return str(int(incorrect_num) if correct_num == int(correct_num) else round(incorrect_num, 2))
+        except (ValueError, AttributeError):
+            pass
+
+        # Strategy 3: Generic incorrect responses by task type
+        if task_name in ["gsm8k", "math500", "aime", "hmmt", "polymath", "livemathbench"]:
+            # Math tasks: slightly wrong number
+            return str(random.randint(0, 1000))
+        elif task_name in ["livecodebench", "humaneval", "mbpp"]:
+            # Coding tasks: empty or syntax error
+            return "# Incomplete solution\npass"
+        else:
+            # Generic incorrect response
+            return "I don't know"
+
+    @staticmethod
+    def _normalize_for_comparison(text: str) -> str:
+        """Normalize text for comparison."""
+        return text.lower().strip()
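
For orientation, a sketch of how this new loader might be driven from Python rather than through the CLI invocations shown in its docstring. The file list pairs this +300-line hunk with wisent/core/data_loaders/loaders/task_interface_loader.py; the sketch assumes TaskInterfaceDataLoader can be constructed without arguments (the BaseDataLoader constructor is not shown in this diff) and uses only the load() parameters and LoadDataResult fields visible above.

# Sketch only: the no-argument constructor is an assumption not confirmed by this diff.
from wisent.core.data_loaders.loaders.task_interface_loader import TaskInterfaceDataLoader

loader = TaskInterfaceDataLoader()
result = loader.load(task="gsm8k", split_ratio=0.8, training_limit=100, testing_limit=20)
# result.train_qa_pairs / result.test_qa_pairs hold the ContrastivePairSets built above
print(result.task_type)  # "classification"
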
@@ -14,7 +14,7 @@ class DataLoaderRotator:
     def __init__(
        self,
        loader: Union[str, BaseDataLoader, Type[BaseDataLoader], None] = None,
-       loaders_location: Union[str, Path] = "
+       loaders_location: Union[str, Path] = "wisent.core.data_loaders.loaders",
        autoload: bool = True,
        **default_loader_kwargs: Any,
    ) -> None:
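
The only change here is the default loaders_location, which now points at the relocated loaders package. A sketch under the assumption that DataLoaderRotator is importable from the renamed module (per the rename entry in the file list) and that the parameters visible in this hunk are sufficient:

# Sketch: module path from the rename entry above; only parameters shown in this hunk are used.
from wisent.core.data_loaders.rotator import DataLoaderRotator

# In 0.5.13 the default loaders_location is "wisent.core.data_loaders.loaders",
# so it no longer needs to be passed explicitly.
rotator = DataLoaderRotator(loader="task_interface", autoload=True)
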
@@ -1276,6 +1276,80 @@ class FullBenchmarkDownloader:
 
         return pairs
 
+    def _perturb_code_to_break(self, code: str) -> str:
+        """
+        Perturb correct code to make it broken/unable to execute at runtime.
+
+        Introduces various types of bugs:
+        - Syntax errors (missing colons, parentheses)
+        - Runtime errors (undefined variables)
+        - Logic errors (wrong operators)
+        - Type errors (wrong return values)
+
+        Args:
+            code: Correct Python code
+
+        Returns:
+            Broken version of the code
+        """
+        lines = code.split('\n')
+        if not lines:
+            return "pass # Broken code"
+
+        # Choose a random perturbation strategy
+        perturbation_type = random.choice([
+            'remove_colon',
+            'remove_return',
+            'wrong_variable',
+            'syntax_error',
+            'wrong_operator',
+            'incomplete_code'
+        ])
+
+        if perturbation_type == 'remove_colon':
+            # Remove colons from function/if/for statements
+            for i, line in enumerate(lines):
+                if any(keyword in line for keyword in ['def ', 'if ', 'for ', 'while ', 'elif ', 'else:']):
+                    lines[i] = line.replace(':', '')
+                    break
+
+        elif perturbation_type == 'remove_return':
+            # Remove return statement to break function
+            for i, line in enumerate(lines):
+                if 'return ' in line:
+                    lines[i] = line.replace('return ', '# return ')
+                    break
+
+        elif perturbation_type == 'wrong_variable':
+            # Use undefined variable name
+            for i, line in enumerate(lines):
+                if '=' in line and 'def ' not in line:
+                    lines[i] = line.replace('=', '= undefined_variable +')
+                    break
+
+        elif perturbation_type == 'syntax_error':
+            # Add syntax error by removing closing parenthesis
+            for i, line in enumerate(lines):
+                if '(' in line and ')' in line:
+                    lines[i] = line.replace(')', '', 1)
+                    break
+
+        elif perturbation_type == 'wrong_operator':
+            # Change operators to break logic
+            for i, line in enumerate(lines):
+                if any(op in line for op in ['+', '-', '*', '/', '<', '>', '==']):
+                    line = line.replace('+', '-', 1) if '+' in line else line
+                    line = line.replace('<', '>', 1) if '<' in line else line
+                    lines[i] = line
+                    break
+
+        elif perturbation_type == 'incomplete_code':
+            # Return only first half of code to make it incomplete
+            lines = lines[:max(1, len(lines) // 2)]
+            lines.append(" # Incomplete implementation")
+
+        return '\n'.join(lines)
+
     def _convert_mbpp_format(self, sample: Dict[str, Any]) -> List[Dict[str, Any]]:
         """Convert MBPP/HumanEval code generation format (task_id, code, prompt, test)."""
         task_id = sample.get("task_id", "")
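
To make the perturbation strategies concrete, an illustrative input/output pair; the method picks one strategy at random, so any single call may produce a different kind of breakage.

# Illustrative only: one possible result of _perturb_code_to_break.
correct_code = "def add(a, b):\n    return a + b"
# With the 'remove_return' strategy the return statement is commented out,
# so the function silently returns None:
broken_code = "def add(a, b):\n    # return a + b"
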
@@ -1285,16 +1359,19 @@ class FullBenchmarkDownloader:
 
         # For code generation tasks, we create contrastive pairs based on:
         # Correct: The reference code solution
-        # Incorrect:
+        # Incorrect: Perturbed version with bugs that prevent runtime execution
 
         pairs = []
 
+        # Generate incorrect code by perturbing the correct solution
+        incorrect_code = self._perturb_code_to_break(code)
+
         # Create a contrastive pair with the coding prompt
         pairs.append(
             {
                 "question": f"Write Python code to solve this problem:\n\n{prompt}",
                 "correct_answer": code,
-                "incorrect_answer":
+                "incorrect_answer": incorrect_code,
                 "metadata": {
                     "task_id": task_id,
                     "test_cases": test,