wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic.
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.1.dist-info/METADATA +67 -0
- wisent-0.5.1.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
wisent/core/tasks/livecodebench_task.py
@@ -0,0 +1,201 @@
+"""
+LiveCodeBench task implementation for task-agnostic architecture.
+"""
+
+from typing import Any, Dict, List, Optional
+
+from ..benchmark_extractors import LiveCodeBenchExtractor
+from ..data_loaders import LiveCodeBenchLoader
+from ..task_interface import TaskInterface
+
+
+class LiveCodeBenchTask(TaskInterface):
+    """LiveCodeBench task implementation."""
+
+    def __init__(self, release_version: str = "release_v1", limit: Optional[int] = None):
+        self._extractor = LiveCodeBenchExtractor()
+        self._data_loader = LiveCodeBenchLoader()
+        self._release_version = release_version
+        self._validate_release_version(release_version)
+        self._data = None  # Cache for loaded data
+        self._limit = limit  # Store limit for later use
+
+    def _validate_release_version(self, release_version: str) -> None:
+        """Validate release version."""
+        try:
+            valid_versions = set(self._data_loader.list_available_versions())
+            if release_version not in valid_versions:
+                raise ValueError(f"Invalid release version: {release_version}. Valid versions: {valid_versions}")
+        except ValueError:
+            # Re-raise validation errors
+            raise
+        except Exception:
+            # If we can't load versions (e.g., due to dataset issues), just log a warning
+            import logging
+
+            logging.warning(
+                f"Could not validate release version {release_version} due to data loader issues. Proceeding with fallback data."
+            )
+
+    def _get_version_info(self) -> Dict[str, Any]:
+        """Get version-specific information."""
+        try:
+            return self._data_loader.get_version_info(self._release_version)
+        except Exception:
+            # Return default info if data loader fails
+            return {
+                "version": self._release_version,
+                "description": f"LiveCodeBench {self._release_version} (fallback mode)",
+                "contest_start": "2023-01-01",
+                "contest_end": "2023-12-31",
+            }
+
+    def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load LiveCodeBench data for the specified release version."""
+        try:
+            # Load real LiveCodeBench data
+            problems = self._data_loader.load_problems(release_version=self._release_version, limit=limit)
+
+            # Convert to dictionary format
+            return [problem.to_dict() for problem in problems]
+
+        except Exception as e:
+            # Fallback to sample data if loading fails
+            import logging
+
+            logging.warning(f"Failed to load real LiveCodeBench data: {e}. Using sample data.")
+            return self._generate_sample_data_fallback(limit)
+
+    def _generate_sample_data_fallback(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Generate sample data for the specified number of problems."""
+        base_problems = [
+            {
+                "task_id": "lcb_001",
+                "question_title": "Two Sum",
+                "question_content": "Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target.",
+                "starter_code": "def two_sum(nums, target):\n # Your code here\n pass",
+                "difficulty": "EASY",
+                "platform": "LEETCODE",
+                "public_test_cases": [{"input": "[2,7,11,15], 9", "output": "[0,1]", "testtype": "FUNCTIONAL"}],
+                "contest_date": "2023-05-15",
+                "metadata": {"tags": ["array", "hash-table"], "constraints": "2 <= nums.length <= 10^4"},
+            },
+            {
+                "task_id": "lcb_002",
+                "question_title": "Valid Parentheses",
+                "question_content": "Given a string s containing just the characters '(', ')', '{', '}', '[' and ']', determine if the input string is valid.",
+                "starter_code": "def is_valid(s):\n # Your code here\n pass",
+                "difficulty": "EASY",
+                "platform": "LEETCODE",
+                "public_test_cases": [{"input": '"()"', "output": "true", "testtype": "FUNCTIONAL"}],
+                "contest_date": "2023-06-01",
+                "metadata": {"tags": ["string", "stack"], "constraints": "1 <= s.length <= 10^4"},
+            },
+            {
+                "task_id": "lcb_003",
+                "question_title": "Longest Increasing Subsequence",
+                "question_content": "Given an integer array nums, return the length of the longest strictly increasing subsequence.",
+                "starter_code": "def length_of_lis(nums):\n # Your code here\n pass",
+                "difficulty": "MEDIUM",
+                "platform": "LEETCODE",
+                "public_test_cases": [{"input": "[10,9,2,5,3,7,101,18]", "output": "4", "testtype": "FUNCTIONAL"}],
+                "contest_date": "2023-07-10",
+                "metadata": {
+                    "tags": ["array", "binary-search", "dynamic-programming"],
+                    "constraints": "1 <= nums.length <= 2500",
+                },
+            },
+            {
+                "task_id": "lcb_004",
+                "question_title": "Merge Two Sorted Lists",
+                "question_content": "You are given the heads of two sorted linked lists list1 and list2. Merge the two lists into one sorted list.",
+                "starter_code": "def merge_two_lists(list1, list2):\n # Your code here\n pass",
+                "difficulty": "EASY",
+                "platform": "LEETCODE",
+                "public_test_cases": [
+                    {"input": "[1,2,4], [1,3,4]", "output": "[1,1,2,3,4,4]", "testtype": "FUNCTIONAL"}
+                ],
+                "contest_date": "2023-08-01",
+                "metadata": {
+                    "tags": ["linked-list", "recursion"],
+                    "constraints": "0 <= list1.length, list2.length <= 50",
+                },
+            },
+            {
+                "task_id": "lcb_005",
+                "question_title": "Best Time to Buy and Sell Stock",
+                "question_content": "You are given an array prices where prices[i] is the price of a given stock on the ith day. Find the maximum profit.",
+                "starter_code": "def max_profit(prices):\n # Your code here\n pass",
+                "difficulty": "EASY",
+                "platform": "LEETCODE",
+                "public_test_cases": [{"input": "[7,1,5,3,6,4]", "output": "5", "testtype": "FUNCTIONAL"}],
+                "contest_date": "2023-09-15",
+                "metadata": {"tags": ["array", "dynamic-programming"], "constraints": "1 <= prices.length <= 10^5"},
+            },
+        ]
+
+        # Generate limited sample data for fallback
+        if limit:
+            base_problems = base_problems[:limit]
+
+        # Add version-specific metadata
+        for problem in base_problems:
+            problem["release_version"] = self._release_version
+
+        return base_problems
+
+    def get_extractor(self):
+        """Get the LiveCodeBench extractor."""
+        return self._extractor
+
+    def get_name(self) -> str:
+        """Get the task name."""
+        return "livecodebench"
+
+    def get_description(self) -> str:
+        """Get the task description."""
+        version_info = self._get_version_info()
+        return f"LiveCodeBench {self._release_version}: Contamination-free coding benchmark with {version_info['problems']} problems ({version_info['date_range']}) from LeetCode, AtCoder, and CodeForces"
+
+    def get_categories(self) -> List[str]:
+        """Get the task categories."""
+        return ["coding", "reasoning", "algorithms", "data-structures"]
+
+    # Methods to match lm-eval interface
+    def has_validation_docs(self) -> bool:
+        """Check if task has validation documents."""
+        return False  # LiveCodeBench doesn't have separate validation sets
+
+    def has_test_docs(self) -> bool:
+        """Check if task has test documents."""
+        return True  # All samples are considered test docs
+
+    def test_docs(self) -> List[Dict[str, Any]]:
+        """Get test documents."""
+        if self._data is None:
+            self._data = self.load_data(limit=self._limit)
+        return self._data
+
+    def validation_docs(self) -> List[Dict[str, Any]]:
+        """Get validation documents."""
+        return []  # No separate validation set
+
+    def doc_to_text(self, doc: Dict[str, Any]) -> str:
+        """Convert document to text prompt."""
+        # Combine problem description with starter code
+        question = doc.get("question_content", "")
+        starter = doc.get("starter_code", "")
+        return f"{question}\n\n{starter}"
+
+
+# TODO: In a real implementation, this would integrate with the actual LiveCodeBench library
+# Example integration:
+# from livecodebench import LiveCodeBench
+#
+# class LiveCodeBenchTask(TaskInterface):
+#     def __init__(self):
+#         self._lcb = LiveCodeBench()
+#         # self._extractor = LiveCodeBenchExtractor()  # Not needed with model outputs approach
+#
+#     def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+#         return self._lcb.load_problems(limit=limit)
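For orientation, a minimal usage sketch of the new task class follows. The import path is inferred from the file list above, not shown in this hunk. One caveat visible in the diff: get_description() reads version_info['problems'] and version_info['date_range'], keys the fallback dict in _get_version_info() does not supply, so it can raise a KeyError when the data loader fails; the sketch sticks to the doc-access methods.

    # Minimal sketch; import path inferred from the file list above.
    from wisent.core.tasks.livecodebench_task import LiveCodeBenchTask

    # Cap at two problems; load_data() falls back to the bundled sample
    # problems if the real LiveCodeBench data cannot be fetched.
    task = LiveCodeBenchTask(release_version="release_v1", limit=2)

    for doc in task.test_docs():        # lazily calls load_data(limit=2)
        prompt = task.doc_to_text(doc)  # question_content + starter_code
        print(doc["task_id"], doc["difficulty"])  # keys per the sample schema
        print(prompt)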
wisent/core/tasks/livemathbench_task.py
@@ -0,0 +1,158 @@
+"""
+LiveMathBench CNMO 2024 mathematical reasoning task implementation for task-agnostic architecture.
+"""
+
+from typing import Any, Dict, List, Optional
+
+import datasets
+
+from ..benchmark_extractors import GSM8KExtractor
+from ..task_interface import TaskInterface
+
+
+class LiveMathBenchTask(TaskInterface):
+    """LiveMathBench CNMO 2024 mathematical reasoning task implementation."""
+
+    # Dataset configurations for CNMO 2024 Chinese and English
+    DATASET_CONFIGS = {
+        "cnmo_en": {
+            "source": "opencompass/LiveMathBench",
+            "config": "v202412_CNMO_en",
+            "split": "test",
+            "fields": {"problem": "question", "answer": "answer"},
+            "description": "18 CNMO 2024 mathematical problems in English",
+        },
+        "cnmo_zh": {
+            "source": "opencompass/LiveMathBench",
+            "config": "v202412_CNMO_cn",
+            "split": "test",
+            "fields": {"problem": "question", "answer": "answer"},
+            "description": "18 CNMO 2024 mathematical problems in Chinese",
+        },
+    }
+
+    def __init__(self, language: str = "en", limit: Optional[int] = None):
+        """
+        Initialize LiveMathBench task for specified language.
+
+        Args:
+            language: Language code ("en" for English, "zh" for Chinese). Default: "en"
+            limit: Maximum number of samples to load
+        """
+        config_key = f"cnmo_{language}" if language in ["en", "zh"] else "cnmo_en"
+        if config_key not in self.DATASET_CONFIGS:
+            available = list(self.DATASET_CONFIGS.keys())
+            raise ValueError(f"LiveMathBench config '{config_key}' not supported. Available: {available}")
+
+        self.language = language
+        self.config_key = config_key
+        self.config = self.DATASET_CONFIGS[config_key]
+        self._limit = limit
+        self._data = None  # Cache for loaded data
+        self._extractor = GSM8KExtractor()  # Reuse enhanced GSM8K extractor
+
+    def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load LiveMathBench CNMO 2024 data from HuggingFace for specified language."""
+        # Load dataset based on language configuration
+        try:
+            dataset = datasets.load_dataset(self.config["source"], self.config["config"], split=self.config["split"])
+        except ValueError as e:
+            if "Feature type 'List' not found" in str(e):
+                # Clear cache and retry due to deprecated feature type
+                import os
+                import shutil
+
+                from datasets.utils.logging import get_logger
+
+                logger = get_logger(__name__)
+                logger.warning(f"Clearing dataset cache due to deprecated 'List' feature type: {e}")
+
+                # Clear entire datasets cache to ensure clean state
+                try:
+                    cache_dir = datasets.config.HF_DATASETS_CACHE
+                    if os.path.exists(cache_dir):
+                        shutil.rmtree(cache_dir)
+                        logger.info(f"Removed entire datasets cache directory: {cache_dir}")
+                except Exception as cache_error:
+                    logger.warning(f"Failed to clear cache: {cache_error}")
+
+                # Try loading again after cache clear with explicit cache disable
+                dataset = datasets.load_dataset(
+                    self.config["source"],
+                    self.config["config"],
+                    split=self.config["split"],
+                    cache_dir=None,  # Disable caching for this load
+                    download_mode="force_redownload",
+                )
+            else:
+                raise
+
+        # Apply limit
+        effective_limit = limit or self._limit
+        if effective_limit:
+            dataset = dataset.select(range(min(effective_limit, len(dataset))))
+
+        # Convert to list and normalize field names
+        data = [dict(item) for item in dataset]
+
+        # Normalize field names for consistent processing
+        normalized_data = []
+        problem_field = self.config["fields"]["problem"]
+        answer_field = self.config["fields"]["answer"]
+
+        for item in data:
+            normalized_item = dict(item)  # Keep all original fields
+
+            # Ensure consistent field names for extractor
+            if problem_field in item:
+                normalized_item["Problem"] = item[problem_field]
+                normalized_item["question"] = item[problem_field]  # For question/answer format
+
+            if answer_field in item:
+                normalized_item["Answer"] = item[answer_field]
+                normalized_item["answer"] = item[answer_field]  # For question/answer format
+
+            normalized_data.append(normalized_item)
+
+        return normalized_data
+
+    def get_task_info(self) -> Dict[str, Any]:
+        """Get information about the LiveMathBench task."""
+        return {
+            "task_name": f"livemathbench_{self.config_key}",
+            "language": self.language,
+            "contest": "CNMO 2024",
+            "description": self.config["description"],
+            "source": self.config["source"],
+            "task_type": "text_generation",
+            "evaluation_method": "mathematical_equivalence",
+        }
+
+    def validate_sample(self, sample: Dict[str, Any]) -> bool:
+        """Validate that a sample has required LiveMathBench fields."""
+        problem_field = self.config["fields"]["problem"]
+        answer_field = self.config["fields"]["answer"]
+
+        return all(field in sample for field in [problem_field, answer_field])
+
+    def get_extractor(self) -> GSM8KExtractor:
+        """Get the benchmark extractor for this task."""
+        return self._extractor
+
+    def get_name(self) -> str:
+        """Get the task name."""
+        return f"livemathbench_{self.config_key}"
+
+    def get_description(self) -> str:
+        """Get the task description."""
+        lang_name = "Chinese" if self.language == "zh" else "English"
+        return f"LiveMathBench CNMO 2024 mathematical olympiad problems in {lang_name}"
+
+    def get_categories(self) -> List[str]:
+        """Get the task categories."""
+        return ["mathematics", "reasoning", "olympiad", "multilingual", "text_generation"]
+
+    @classmethod
+    def get_supported_languages(cls) -> List[str]:
+        """Get list of supported languages for CNMO 2024."""
+        return ["en", "zh"]
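A matching usage sketch for the math task (import path again inferred from the file list; the lowercase "question"/"answer" keys are guaranteed by the normalization step in load_data()):

    # Minimal sketch; import path inferred from the file list above.
    from wisent.core.tasks.livemathbench_task import LiveMathBenchTask

    task = LiveMathBenchTask(language="en", limit=5)    # English CNMO 2024 split
    print(task.get_name())                              # livemathbench_cnmo_en
    print(LiveMathBenchTask.get_supported_languages())  # ['en', 'zh']

    for sample in task.load_data():
        assert task.validate_sample(sample)
        # load_data() mirrors the source fields into question/answer
        # (and Problem/Answer) for the GSM8K-style extractor.
        print(sample["question"], "->", sample["answer"])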