wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (237) hide show
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,201 @@
1
+ """
2
+ LiveCodeBench task implementation for task-agnostic architecture.
3
+ """
4
+
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from ..benchmark_extractors import LiveCodeBenchExtractor
8
+ from ..data_loaders import LiveCodeBenchLoader
9
+ from ..task_interface import TaskInterface
10
+
11
+
12
class LiveCodeBenchTask(TaskInterface):
    """LiveCodeBench task implementation.

    Problems are requested from a ``LiveCodeBenchLoader``. If the loader
    fails for any reason, a small built-in set of sample problems is served
    instead so that callers still receive data ("fallback mode").
    """

    def __init__(self, release_version: str = "release_v1", limit: Optional[int] = None):
        """Create the task.

        Args:
            release_version: Release identifier passed to the data loader.
            limit: Maximum number of problems ``test_docs`` will load.

        Raises:
            ValueError: If the loader could list its versions and
                ``release_version`` is not one of them.
        """
        self._extractor = LiveCodeBenchExtractor()
        self._data_loader = LiveCodeBenchLoader()
        self._release_version = release_version
        self._validate_release_version(release_version)
        self._data = None  # Cache for loaded data
        self._limit = limit  # Store limit for later use

    def _validate_release_version(self, release_version: str) -> None:
        """Check ``release_version`` against the versions the loader reports.

        Only the loader call is guarded: when the version list cannot be
        obtained a warning is logged and validation is skipped. A loader
        failure is therefore never mistaken for an invalid version, even if
        the loader itself raises ``ValueError``.

        Raises:
            ValueError: If the version list was obtained and does not
                contain ``release_version``.
        """
        try:
            valid_versions = set(self._data_loader.list_available_versions())
        except Exception:
            # If we can't load versions (e.g., due to dataset issues), just log a warning
            import logging

            logging.warning(
                f"Could not validate release version {release_version} due to data loader issues. Proceeding with fallback data."
            )
            return

        if release_version not in valid_versions:
            raise ValueError(f"Invalid release version: {release_version}. Valid versions: {valid_versions}")

    def _get_version_info(self) -> Dict[str, Any]:
        """Return the loader's info for this release, or a default dict if the loader fails."""
        try:
            return self._data_loader.get_version_info(self._release_version)
        except Exception:
            # Return default info if data loader fails
            return {
                "version": self._release_version,
                "description": f"LiveCodeBench {self._release_version} (fallback mode)",
                "contest_start": "2023-01-01",
                "contest_end": "2023-12-31",
            }

    def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Load LiveCodeBench data for the configured release version.

        Args:
            limit: Maximum number of problems, forwarded to the loader.

        Returns:
            One dict per problem (``problem.to_dict()``), or the built-in
            sample problems when loading or conversion raises.
        """
        try:
            # Load real LiveCodeBench data
            problems = self._data_loader.load_problems(release_version=self._release_version, limit=limit)

            # Convert to dictionary format
            return [problem.to_dict() for problem in problems]

        except Exception as e:
            # Fallback to sample data if loading fails
            import logging

            logging.warning(f"Failed to load real LiveCodeBench data: {e}. Using sample data.")
            return self._generate_sample_data_fallback(limit)

    def _generate_sample_data_fallback(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Build the built-in sample problems, truncated to ``limit`` when it is truthy.

        The dicts are created fresh on every call, so tagging them with the
        release version below never leaks state between calls.
        """
        base_problems = [
            {
                "task_id": "lcb_001",
                "question_title": "Two Sum",
                "question_content": "Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target.",
                "starter_code": "def two_sum(nums, target):\n # Your code here\n pass",
                "difficulty": "EASY",
                "platform": "LEETCODE",
                "public_test_cases": [{"input": "[2,7,11,15], 9", "output": "[0,1]", "testtype": "FUNCTIONAL"}],
                "contest_date": "2023-05-15",
                "metadata": {"tags": ["array", "hash-table"], "constraints": "2 <= nums.length <= 10^4"},
            },
            {
                "task_id": "lcb_002",
                "question_title": "Valid Parentheses",
                "question_content": "Given a string s containing just the characters '(', ')', '{', '}', '[' and ']', determine if the input string is valid.",
                "starter_code": "def is_valid(s):\n # Your code here\n pass",
                "difficulty": "EASY",
                "platform": "LEETCODE",
                "public_test_cases": [{"input": '"()"', "output": "true", "testtype": "FUNCTIONAL"}],
                "contest_date": "2023-06-01",
                "metadata": {"tags": ["string", "stack"], "constraints": "1 <= s.length <= 10^4"},
            },
            {
                "task_id": "lcb_003",
                "question_title": "Longest Increasing Subsequence",
                "question_content": "Given an integer array nums, return the length of the longest strictly increasing subsequence.",
                "starter_code": "def length_of_lis(nums):\n # Your code here\n pass",
                "difficulty": "MEDIUM",
                "platform": "LEETCODE",
                "public_test_cases": [{"input": "[10,9,2,5,3,7,101,18]", "output": "4", "testtype": "FUNCTIONAL"}],
                "contest_date": "2023-07-10",
                "metadata": {
                    "tags": ["array", "binary-search", "dynamic-programming"],
                    "constraints": "1 <= nums.length <= 2500",
                },
            },
            {
                "task_id": "lcb_004",
                "question_title": "Merge Two Sorted Lists",
                "question_content": "You are given the heads of two sorted linked lists list1 and list2. Merge the two lists into one sorted list.",
                "starter_code": "def merge_two_lists(list1, list2):\n # Your code here\n pass",
                "difficulty": "EASY",
                "platform": "LEETCODE",
                "public_test_cases": [
                    {"input": "[1,2,4], [1,3,4]", "output": "[1,1,2,3,4,4]", "testtype": "FUNCTIONAL"}
                ],
                "contest_date": "2023-08-01",
                "metadata": {
                    "tags": ["linked-list", "recursion"],
                    "constraints": "0 <= list1.length, list2.length <= 50",
                },
            },
            {
                "task_id": "lcb_005",
                "question_title": "Best Time to Buy and Sell Stock",
                "question_content": "You are given an array prices where prices[i] is the price of a given stock on the ith day. Find the maximum profit.",
                "starter_code": "def max_profit(prices):\n # Your code here\n pass",
                "difficulty": "EASY",
                "platform": "LEETCODE",
                "public_test_cases": [{"input": "[7,1,5,3,6,4]", "output": "5", "testtype": "FUNCTIONAL"}],
                "contest_date": "2023-09-15",
                "metadata": {"tags": ["array", "dynamic-programming"], "constraints": "1 <= prices.length <= 10^5"},
            },
        ]

        # Generate limited sample data for fallback
        if limit:
            base_problems = base_problems[:limit]

        # Add version-specific metadata
        for problem in base_problems:
            problem["release_version"] = self._release_version

        return base_problems

    def get_extractor(self):
        """Get the LiveCodeBench extractor."""
        return self._extractor

    def get_name(self) -> str:
        """Get the task name."""
        return "livecodebench"

    def get_description(self) -> str:
        """Get the task description.

        The fallback dict produced by ``_get_version_info`` has no
        ``problems`` or ``date_range`` keys, so both are looked up
        defensively; the date range is rebuilt from ``contest_start`` and
        ``contest_end`` when it is not supplied directly.
        """
        version_info = self._get_version_info()
        problem_count = version_info.get("problems", "an unknown number of")

        date_range = version_info.get("date_range")
        if date_range is None:
            start = version_info.get("contest_start")
            end = version_info.get("contest_end")
            date_range = f"{start} to {end}" if start and end else "unknown date range"

        return f"LiveCodeBench {self._release_version}: Contamination-free coding benchmark with {problem_count} problems ({date_range}) from LeetCode, AtCoder, and CodeForces"

    def get_categories(self) -> List[str]:
        """Get the task categories."""
        return ["coding", "reasoning", "algorithms", "data-structures"]

    # Methods to match lm-eval interface
    def has_validation_docs(self) -> bool:
        """Check if task has validation documents."""
        return False  # LiveCodeBench doesn't have separate validation sets

    def has_test_docs(self) -> bool:
        """Check if task has test documents."""
        return True  # All samples are considered test docs

    def test_docs(self) -> List[Dict[str, Any]]:
        """Get test documents, loading them once (with the constructor's limit) and caching the result."""
        if self._data is None:
            self._data = self.load_data(limit=self._limit)
        return self._data

    def validation_docs(self) -> List[Dict[str, Any]]:
        """Get validation documents."""
        return []  # No separate validation set

    def doc_to_text(self, doc: Dict[str, Any]) -> str:
        """Convert document to text prompt: problem statement, blank line, starter code."""
        # Combine problem description with starter code
        question = doc.get("question_content", "")
        starter = doc.get("starter_code", "")
        return f"{question}\n\n{starter}"
189
+
190
+
191
+ # TODO: In a real implementation, this would integrate with the actual LiveCodeBench library
192
+ # Example integration:
193
+ # from livecodebench import LiveCodeBench
194
+ #
195
+ # class LiveCodeBenchTask(TaskInterface):
196
+ # def __init__(self):
197
+ # self._lcb = LiveCodeBench()
198
+ # # self._extractor = LiveCodeBenchExtractor() # Not needed with model outputs approach
199
+ #
200
+ # def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
201
+ # return self._lcb.load_problems(limit=limit)
@@ -0,0 +1,158 @@
1
+ """
2
+ LiveMathBench CNMO 2024 mathematical reasoning task implementation for task-agnostic architecture.
3
+ """
4
+
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ import datasets
8
+
9
+ from ..benchmark_extractors import GSM8KExtractor
10
+ from ..task_interface import TaskInterface
11
+
12
+
13
class LiveMathBenchTask(TaskInterface):
    """LiveMathBench CNMO 2024 mathematical reasoning task implementation."""

    # Dataset configurations for CNMO 2024 Chinese and English
    DATASET_CONFIGS = {
        "cnmo_en": {
            "source": "opencompass/LiveMathBench",
            "config": "v202412_CNMO_en",
            "split": "test",
            "fields": {"problem": "question", "answer": "answer"},
            "description": "18 CNMO 2024 mathematical problems in English",
        },
        "cnmo_zh": {
            "source": "opencompass/LiveMathBench",
            "config": "v202412_CNMO_cn",
            "split": "test",
            "fields": {"problem": "question", "answer": "answer"},
            "description": "18 CNMO 2024 mathematical problems in Chinese",
        },
    }

    def __init__(self, language: str = "en", limit: Optional[int] = None):
        """
        Initialize LiveMathBench task for specified language.

        An unsupported language code still falls back to English, but the
        fallback is now logged and ``self.language`` records the language
        that is actually loaded, so ``get_task_info`` does not report a
        language the data is not in.

        Args:
            language: Language code ("en" for English, "zh" for Chinese). Default: "en"
            limit: Maximum number of samples to load

        Raises:
            ValueError: If no entry in ``DATASET_CONFIGS`` exists for the
                resolved language.
        """
        if language in ["en", "zh"]:
            effective_language = language
        else:
            import logging

            logging.warning(f"Unsupported LiveMathBench language '{language}'; falling back to 'en'.")
            effective_language = "en"

        config_key = f"cnmo_{effective_language}"
        if config_key not in self.DATASET_CONFIGS:
            available = list(self.DATASET_CONFIGS.keys())
            raise ValueError(f"LiveMathBench config '{config_key}' not supported. Available: {available}")

        self.language = effective_language
        self.config_key = config_key
        self.config = self.DATASET_CONFIGS[config_key]
        self._limit = limit
        self._data = None  # Cache for loaded data
        self._extractor = GSM8KExtractor()  # Reuse enhanced GSM8K extractor

    def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Load LiveMathBench CNMO 2024 data from HuggingFace for specified language.

        Args:
            limit: Maximum number of samples; when falsy, the limit given to
                the constructor is used instead.

        Returns:
            One dict per sample holding every original field plus the
            aliases ``Problem``/``question`` and ``Answer``/``answer``.

        Raises:
            ValueError: Re-raised from ``datasets.load_dataset`` unless its
                message contains "Feature type 'List' not found".
        """
        # Load dataset based on language configuration
        try:
            dataset = datasets.load_dataset(self.config["source"], self.config["config"], split=self.config["split"])
        except ValueError as e:
            if "Feature type 'List' not found" in str(e):
                # Clear cache and retry due to deprecated feature type
                import os
                import shutil

                from datasets.utils.logging import get_logger

                logger = get_logger(__name__)
                logger.warning(f"Clearing dataset cache due to deprecated 'List' feature type: {e}")

                # Clear entire datasets cache to ensure clean state.
                # NOTE(review): this removes every dataset cached under
                # HF_DATASETS_CACHE, not only this one — confirm that is acceptable.
                try:
                    cache_dir = datasets.config.HF_DATASETS_CACHE
                    if os.path.exists(cache_dir):
                        shutil.rmtree(cache_dir)
                        logger.info(f"Removed entire datasets cache directory: {cache_dir}")
                except Exception as cache_error:
                    logger.warning(f"Failed to clear cache: {cache_error}")

                # Try loading again after the cache clear, forcing a fresh download.
                # NOTE(review): cache_dir=None is presumably the same as not passing it
                # (default cache location), so it likely does not disable caching — confirm.
                dataset = datasets.load_dataset(
                    self.config["source"],
                    self.config["config"],
                    split=self.config["split"],
                    cache_dir=None,
                    download_mode="force_redownload",
                )
            else:
                raise

        # Apply limit
        effective_limit = limit or self._limit
        if effective_limit:
            dataset = dataset.select(range(min(effective_limit, len(dataset))))

        # Normalize field names for consistent processing
        problem_field = self.config["fields"]["problem"]
        answer_field = self.config["fields"]["answer"]

        normalized_data = []
        for item in dataset:
            normalized_item = dict(item)  # Keep all original fields

            # Ensure consistent field names for extractor
            if problem_field in item:
                normalized_item["Problem"] = item[problem_field]
                normalized_item["question"] = item[problem_field]  # For question/answer format

            if answer_field in item:
                normalized_item["Answer"] = item[answer_field]
                normalized_item["answer"] = item[answer_field]  # For question/answer format

            normalized_data.append(normalized_item)

        return normalized_data

    def get_task_info(self) -> Dict[str, Any]:
        """Get information about the LiveMathBench task."""
        return {
            "task_name": f"livemathbench_{self.config_key}",
            "language": self.language,
            "contest": "CNMO 2024",
            "description": self.config["description"],
            "source": self.config["source"],
            "task_type": "text_generation",
            "evaluation_method": "mathematical_equivalence",
        }

    def validate_sample(self, sample: Dict[str, Any]) -> bool:
        """Validate that a sample has the configured problem and answer fields."""
        problem_field = self.config["fields"]["problem"]
        answer_field = self.config["fields"]["answer"]

        return all(field in sample for field in [problem_field, answer_field])

    def get_extractor(self) -> GSM8KExtractor:
        """Get the benchmark extractor for this task."""
        return self._extractor

    def get_name(self) -> str:
        """Get the task name."""
        return f"livemathbench_{self.config_key}"

    def get_description(self) -> str:
        """Get the task description."""
        lang_name = "Chinese" if self.language == "zh" else "English"
        return f"LiveMathBench CNMO 2024 mathematical olympiad problems in {lang_name}"

    def get_categories(self) -> List[str]:
        """Get the task categories."""
        return ["mathematics", "reasoning", "olympiad", "multilingual", "text_generation"]

    @classmethod
    def get_supported_languages(cls) -> List[str]:
        """Get list of supported languages for CNMO 2024."""
        return ["en", "zh"]