wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of wisent might be problematic.

Files changed (237)
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
wisent/core/tasks/lm_eval_task.py
@@ -0,0 +1,455 @@
+ """
+ LM-Evaluation-Harness task wrapper for task-agnostic architecture.
+ """
+
+ from typing import Any, Dict, List, Optional
+
+ from ..benchmark_extractors import BenchmarkExtractor, get_extractor
+ from ..task_interface import TaskInterface
+
+
+ class LMEvalTask(TaskInterface):
+     """Wrapper for lm-evaluation-harness tasks."""
+
+     def __init__(self, task_name: str, description: str, categories: List[str]):
+         self.task_name = task_name
+         self._description = description
+         self._categories = categories
+         self._extractor = get_extractor(task_name)
+
+     def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+         """Load data directly from lm-eval without Model dependency."""
+         try:
+             # Load data directly from lm-eval without creating a Model instance
+             from lm_eval.tasks import get_task_dict
+
+             # Get task directly from lm-eval
+             task_dict = get_task_dict([self.task_name])
+             if self.task_name not in task_dict:
+                 print(f"Warning: Task '{self.task_name}' not found in lm-eval")
+                 return []
+
+             task = task_dict[self.task_name]
+
+             # Get the task's test documents
+             docs = []
+             if hasattr(task, "test_docs"):
+                 # For lm-eval versions with test_docs method
+                 docs = list(task.test_docs())
+             elif hasattr(task, "dataset"):
+                 # For newer lm-eval versions
+                 dataset = task.dataset
+                 if hasattr(dataset, "test"):
+                     docs = list(dataset.test)
+                 elif hasattr(dataset, "validation"):
+                     docs = list(dataset.validation)
+                 else:
+                     # Fallback to the main dataset
+                     docs = list(dataset)
+
+             # Ensure docs are in dictionary format
+             processed_docs = []
+             for doc in docs:
+                 if isinstance(doc, dict):
+                     processed_docs.append(doc)
+                 elif isinstance(doc, str):
+                     # Handle string documents by wrapping them
+                     processed_docs.append({"text": doc})
+                 else:
+                     # Try to convert to dict if possible
+                     try:
+                         processed_docs.append(dict(doc))
+                     except:
+                         processed_docs.append({"data": str(doc)})
+
+             docs = processed_docs
+
+             # Apply limit if specified
+             if limit and len(docs) > limit:
+                 docs = docs[:limit]
+
+             return docs
+
+         except Exception as e:
+             print(f"Warning: Could not load lm-eval task '{self.task_name}': {e}")
+             return []
+
+     def get_extractor(self) -> BenchmarkExtractor:
+         """Get the benchmark extractor for this task."""
+         return self._extractor
+
+     def get_name(self) -> str:
+         """Get the task name."""
+         return self.task_name
+
+     def get_description(self) -> str:
+         """Get the task description."""
+         return self._description
+
+     def get_categories(self) -> List[str]:
+         """Get the task categories."""
+         return self._categories
+
+
+ class MBPPTask(LMEvalTask):
+     """MBPP task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="mbpp",
+             description="MBPP: Mostly Basic Python Problems coding benchmark",
+             categories=["coding", "reasoning", "python"],
+         )
+
+
+ class HumanEvalTask(LMEvalTask):
+     """HumanEval task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="humaneval",
+             description="HumanEval: Human Evaluation of Python coding problems",
+             categories=["coding", "reasoning", "python"],
+         )
+
+
+ class MBPPPlusTask(LMEvalTask):
+     """MBPP Plus task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="mbpp_plus",
+             description="MBPP Plus: Extended version of MBPP with additional test cases",
+             categories=["coding", "reasoning", "python"],
+         )
+
+
+ class GSM8KTask(LMEvalTask):
+     """GSM8K task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="gsm8k",
+             description="GSM8K: Grade School Math 8K problems",
+             categories=["mathematics", "reasoning", "arithmetic"],
+         )
+
+
+ class TruthfulQATask(LMEvalTask):
+     """TruthfulQA task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="truthfulqa_mc1",
+             description="TruthfulQA: Truthfulness evaluation benchmark",
+             categories=["hallucination", "general-knowledge", "reasoning"],
+         )
+
+     def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+         """Load TruthfulQA data, which only has validation split."""
+         try:
+             from lm_eval.tasks import get_task_dict
+
+             # Get task directly from lm-eval
+             task_dict = get_task_dict([self.task_name])
+             if self.task_name not in task_dict:
+                 print(f"Warning: Task '{self.task_name}' not found in lm-eval")
+                 return []
+
+             task = task_dict[self.task_name]
+
+             # TruthfulQA only has validation split, access it directly
+             docs = []
+             if hasattr(task, "dataset") and "validation" in task.dataset:
+                 validation_data = task.dataset["validation"]
+                 docs = list(validation_data)
+
+             # Apply limit if specified
+             if limit and len(docs) > limit:
+                 docs = docs[:limit]
+
+             return docs
+
+         except Exception as e:
+             print(f"Warning: Could not load TruthfulQA task '{self.task_name}': {e}")
+             import traceback
+
+             traceback.print_exc()
+             return []
+
+
+ class MMLUTask(LMEvalTask):
+     """MMLU task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="mmlu",
+             description="MMLU: Massive Multitask Language Understanding",
+             categories=["general-knowledge", "science", "reasoning"],
+         )
+
+
+ # === CODING TASKS ===
+
+
+ class InstructHumanEvalTask(LMEvalTask):
+     """InstructHumanEval task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="instructhumaneval",
+             description="InstructHumanEval: Instruction-following HumanEval benchmark",
+             categories=["coding", "reasoning", "python", "instruction-following"],
+         )
+
+
+ class HumanEvalPlusTask(LMEvalTask):
+     """HumanEval Plus task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="humaneval_plus",
+             description="HumanEval Plus: Extended HumanEval with more tests",
+             categories=["coding", "reasoning", "python"],
+         )
+
+
+ class ConalaTask(LMEvalTask):
+     """Conala task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="conala",
+             description="Conala: Code generation from natural language",
+             categories=["coding", "reasoning", "python", "nl2code"],
+         )
+
+
+ class ConcodeTask(LMEvalTask):
+     """Concode task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="concode",
+             description="Concode: Code completion benchmark",
+             categories=["coding", "reasoning", "completion"],
+         )
+
+
+ class MercuryTask(LMEvalTask):
+     """Mercury task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="mercury",
+             description="Mercury: Code generation benchmark",
+             categories=["coding", "reasoning"],
+         )
+
+
+ class AppsTask(LMEvalTask):
+     """APPS task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="apps",
+             description="APPS: Automated Programming Problems Synthesis",
+             categories=["coding", "reasoning", "python", "competitive"],
+         )
+
+
+ class DS1000Task(LMEvalTask):
+     """DS1000 task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="ds1000",
+             description="DS1000: Data Science coding tasks",
+             categories=["coding", "reasoning", "python", "data-science"],
+         )
+
+
+ class MultiplePyTask(LMEvalTask):
+     """Multiple-Py task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="multiple_py",
+             description="Multiple-Py: Multi-language Python tasks",
+             categories=["coding", "reasoning", "python", "multi-language"],
+         )
+
+
+ class MultipleJsTask(LMEvalTask):
+     """Multiple-JS task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="multiple_js",
+             description="Multiple-JS: Multi-language JavaScript tasks",
+             categories=["coding", "reasoning", "javascript", "multi-language"],
+         )
+
+
+ class MultipleJavaTask(LMEvalTask):
+     """Multiple-Java task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="multiple_java",
+             description="Multiple-Java: Multi-language Java tasks",
+             categories=["coding", "reasoning", "java", "multi-language"],
+         )
+
+
+ class MultipleCppTask(LMEvalTask):
+     """Multiple-Cpp task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="multiple_cpp",
+             description="Multiple-Cpp: Multi-language C++ tasks",
+             categories=["coding", "reasoning", "cpp", "multi-language"],
+         )
+
+
+ class MultipleRsTask(LMEvalTask):
+     """Multiple-Rs task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="multiple_rs",
+             description="Multiple-Rs: Multi-language Rust tasks",
+             categories=["coding", "reasoning", "rust", "multi-language"],
+         )
+
+
+ class MultipleGoTask(LMEvalTask):
+     """Multiple-Go task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="multiple_go",
+             description="Multiple-Go: Multi-language Go tasks",
+             categories=["coding", "reasoning", "go", "multi-language"],
+         )
+
+
+ class CodexglueCodeToTextPythonTask(LMEvalTask):
+     """CodexGlue Code-to-Text Python task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="codexglue_code_to_text_python",
+             description="CodexGlue Code-to-Text Python: Python code summarization",
+             categories=["coding", "reasoning", "python", "code-to-text"],
+         )
+
+
+ class CodexglueCodeToTextGoTask(LMEvalTask):
+     """CodexGlue Code-to-Text Go task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="codexglue_code_to_text_go",
+             description="CodexGlue Code-to-Text Go: Go code summarization",
+             categories=["coding", "reasoning", "go", "code-to-text"],
+         )
+
+
+ class CodexglueCodeToTextRubyTask(LMEvalTask):
+     """CodexGlue Code-to-Text Ruby task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="codexglue_code_to_text_ruby",
+             description="CodexGlue Code-to-Text Ruby: Ruby code summarization",
+             categories=["coding", "reasoning", "ruby", "code-to-text"],
+         )
+
+
+ class CodexglueCodeToTextJavaTask(LMEvalTask):
+     """CodexGlue Code-to-Text Java task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="codexglue_code_to_text_java",
+             description="CodexGlue Code-to-Text Java: Java code summarization",
+             categories=["coding", "reasoning", "java", "code-to-text"],
+         )
+
+
+ class CodexglueCodeToTextJavascriptTask(LMEvalTask):
+     """CodexGlue Code-to-Text JavaScript task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="codexglue_code_to_text_javascript",
+             description="CodexGlue Code-to-Text JavaScript: JavaScript code summarization",
+             categories=["coding", "reasoning", "javascript", "code-to-text"],
+         )
+
+
+ class CodexglueCodeToTextPhpTask(LMEvalTask):
+     """CodexGlue Code-to-Text PHP task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="codexglue_code_to_text_php",
+             description="CodexGlue Code-to-Text PHP: PHP code summarization",
+             categories=["coding", "reasoning", "php", "code-to-text"],
+         )
+
+
+ class RecodeTask(LMEvalTask):
+     """Recode task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="recode",
+             description="Recode: Perturbed HumanEval natural generation",
+             categories=["coding", "reasoning", "python", "perturbation"],
+         )
+
+
+ class Squad2Task(LMEvalTask):
+     """SQuAD2 task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="squadv2",
+             description="SQuAD2: Stanford Question Answering Dataset 2.0",
+             categories=["reading-comprehension", "qa", "natural-language"],
+         )
+
+     def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+         """Load SQuAD2 data, which only has validation split."""
+         try:
+             from lm_eval.tasks import get_task_dict
+
+             # Get task directly from lm-eval
+             task_dict = get_task_dict([self.task_name])
+             if self.task_name not in task_dict:
+                 print(f"Warning: Task '{self.task_name}' not found in lm-eval")
+                 return []
+
+             task = task_dict[self.task_name]
+
+             # SQuAD2 only has validation split, access it directly
+             docs = []
+             if hasattr(task, "dataset") and "validation" in task.dataset:
+                 validation_data = task.dataset["validation"]
+                 docs = list(validation_data)
+
+             # Apply limit if specified
+             if limit and len(docs) > limit:
+                 docs = docs[:limit]
+
+             return docs
+
+         except Exception as e:
+             print(f"Warning: Could not load SQuAD2 task '{self.task_name}': {e}")
+             import traceback
+
+             traceback.print_exc()
+             return []
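For orientation, a minimal usage sketch of the wrappers added above (hypothetical driver code, not part of the package diff; the module path is inferred from the file list and lm-eval must be installed for load_data to return anything):

    # Hypothetical usage of the new LMEvalTask wrappers (assumed import path).
    from wisent.core.tasks.lm_eval_task import GSM8KTask

    task = GSM8KTask()
    print(task.get_name(), task.get_categories())
    docs = task.load_data(limit=5)   # prints a warning and returns [] if lm-eval can't resolve the task
    for doc in docs:
        print(doc)                   # each doc is normalized to a dict by load_data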
wisent/core/tasks/math500_task.py
@@ -0,0 +1,84 @@
+ """
+ MATH-500 task implementation for task-agnostic architecture.
+ """
+
+ from typing import Dict, Any, List, Optional
+ from ..task_interface import TaskInterface
+ from ..benchmark_extractors import GSM8KExtractor
+ import datasets
+
+
+ class Math500Task(TaskInterface):
+     """MATH-500 mathematical reasoning task implementation."""
+
+     def __init__(self, limit: Optional[int] = None):
+         self._limit = limit
+         self._data = None  # Cache for loaded data
+         self._extractor = GSM8KExtractor()  # Reuse GSM8K extractor
+
+     def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+         """Load MATH-500 data from HuggingFace."""
+         dataset = datasets.load_dataset("HuggingFaceH4/MATH-500", split="test")
+
+         # Apply limit
+         effective_limit = limit or self._limit
+         if effective_limit:
+             dataset = dataset.select(range(min(effective_limit, len(dataset))))
+
+         # Convert to list of dictionaries
+         return [dict(item) for item in dataset]
+
+
+     def get_task_info(self) -> Dict[str, Any]:
+         """Get information about the MATH-500 task."""
+         return {
+             "task_name": "math500",
+             "description": "500 mathematical reasoning problems from OpenAI's MATH dataset",
+             "source": "HuggingFaceH4/MATH-500",
+             "task_type": "text_generation",
+             "evaluation_method": "mathematical_equivalence"
+         }
+
+     def validate_sample(self, sample: Dict[str, Any]) -> bool:
+         """Validate that a sample has required MATH-500 fields."""
+         required_fields = ["problem", "answer"]
+         return all(field in sample for field in required_fields)
+
+     def get_extractor(self) -> GSM8KExtractor:
+         """Get the benchmark extractor for this task."""
+         return self._extractor
+
+     def get_name(self) -> str:
+         """Get the task name."""
+         return "math500"
+
+     def get_description(self) -> str:
+         """Get the task description."""
+         return "500 mathematical reasoning problems from OpenAI's MATH dataset requiring multi-step solutions"
+
+     def get_categories(self) -> List[str]:
+         """Get the task categories."""
+         return ["mathematics", "reasoning", "text_generation"]
+
+     # Methods to match lm-eval interface
+     def has_validation_docs(self) -> bool:
+         """Check if task has validation documents."""
+         return False  # MATH-500 doesn't have separate validation sets
+
+     def has_test_docs(self) -> bool:
+         """Check if task has test documents."""
+         return True  # All samples are considered test docs
+
+     def test_docs(self) -> List[Dict[str, Any]]:
+         """Get test documents."""
+         if self._data is None:
+             self._data = self.load_data()
+         return self._data
+
+     def validation_docs(self) -> List[Dict[str, Any]]:
+         """Get validation documents."""
+         return []  # No separate validation set
+
+     def doc_to_text(self, doc: Dict[str, Any]) -> str:
+         """Convert document to text prompt."""
+         return doc.get('problem', '')
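A minimal usage sketch of Math500Task following the lm-eval-style interface it exposes (hypothetical driver code, not part of the package diff; the module path is inferred from the file list and the HuggingFace dataset must be reachable):

    # Hypothetical usage of Math500Task (assumed import path).
    from wisent.core.tasks.math500_task import Math500Task

    task = Math500Task(limit=3)
    for doc in task.test_docs():           # cached after the first load_data() call
        assert task.validate_sample(doc)   # requires "problem" and "answer" fields
        print(task.doc_to_text(doc))       # the "problem" field is used as the prompt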
wisent/core/tasks/polymath_task.py
@@ -0,0 +1,146 @@
+ """
+ PolyMath multilingual mathematical reasoning task implementation for task-agnostic architecture.
+ """
+
+ from typing import Dict, Any, List, Optional
+ from ..task_interface import TaskInterface
+ from ..benchmark_extractors import GSM8KExtractor
+ import datasets
+
+
+ class PolyMathTask(TaskInterface):
+     """PolyMath multilingual mathematical reasoning task implementation."""
+
+     # Dataset configurations for different language-difficulty combinations
+     DATASET_CONFIGS = {
+         "zh_medium": {
+             "source": "Qwen/PolyMath",
+             "language": "zh",
+             "split": "medium",
+             "fields": {"problem": "question", "answer": "answer"},
+             "description": "125 medium-difficulty mathematical problems in Chinese"
+         },
+         "en_medium": {
+             "source": "Qwen/PolyMath",
+             "language": "en",
+             "split": "medium",
+             "fields": {"problem": "question", "answer": "answer"},
+             "description": "125 medium-difficulty mathematical problems in English"
+         },
+         "zh_high": {
+             "source": "Qwen/PolyMath",
+             "language": "zh",
+             "split": "high",
+             "fields": {"problem": "question", "answer": "answer"},
+             "description": "125 high-difficulty mathematical problems in Chinese"
+         },
+         "en_high": {
+             "source": "Qwen/PolyMath",
+             "language": "en",
+             "split": "high",
+             "fields": {"problem": "question", "answer": "answer"},
+             "description": "125 high-difficulty mathematical problems in English"
+         }
+     }
+
+     def __init__(self, language: str = "en", difficulty: str = "medium", limit: Optional[int] = None):
+         """
+         Initialize PolyMath task for specified language and difficulty.
+
+         Args:
+             language: Language code ("en" for English, "zh" for Chinese). Default: "en"
+             difficulty: Difficulty level ("medium", "high"). Default: "medium"
+             limit: Maximum number of samples to load
+         """
+         config_key = f"{language}_{difficulty}"
+         if config_key not in self.DATASET_CONFIGS:
+             available = list(self.DATASET_CONFIGS.keys())
+             raise ValueError(f"PolyMath config '{config_key}' not supported. Available: {available}")
+
+         self.language = language
+         self.difficulty = difficulty
+         self.config_key = config_key
+         self.config = self.DATASET_CONFIGS[config_key]
+         self._limit = limit
+         self._data = None  # Cache for loaded data
+         self._extractor = GSM8KExtractor()  # Reuse enhanced GSM8K extractor
+
+     def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+         """Load PolyMath data from HuggingFace for specified language and difficulty."""
+         # Load dataset based on language and difficulty configuration
+         dataset = datasets.load_dataset(
+             self.config["source"],
+             self.config["language"],
+             split=self.config["split"]
+         )
+
+         # Apply limit
+         effective_limit = limit or self._limit
+         if effective_limit:
+             dataset = dataset.select(range(min(effective_limit, len(dataset))))
+
+         # Convert to list and normalize field names
+         data = [dict(item) for item in dataset]
+
+         # Normalize field names for consistent processing
+         normalized_data = []
+         problem_field = self.config["fields"]["problem"]
+         answer_field = self.config["fields"]["answer"]
+
+         for item in data:
+             normalized_item = dict(item)  # Keep all original fields
+
+             # Ensure consistent field names for extractor
+             if problem_field in item:
+                 normalized_item["Problem"] = item[problem_field]
+                 normalized_item["question"] = item[problem_field]  # For question/answer format
+
+             if answer_field in item:
+                 normalized_item["Answer"] = item[answer_field]
+                 normalized_item["answer"] = item[answer_field]  # For question/answer format
+
+             normalized_data.append(normalized_item)
+
+         return normalized_data
+
+
+     def get_task_info(self) -> Dict[str, Any]:
+         """Get information about the PolyMath task."""
+         return {
+             "task_name": f"polymath_{self.config_key}",
+             "language": self.language,
+             "difficulty": self.difficulty,
+             "description": self.config["description"],
+             "source": self.config["source"],
+             "task_type": "text_generation",
+             "evaluation_method": "mathematical_equivalence"
+         }
+
+     def validate_sample(self, sample: Dict[str, Any]) -> bool:
+         """Validate that a sample has required PolyMath fields."""
+         problem_field = self.config["fields"]["problem"]
+         answer_field = self.config["fields"]["answer"]
+
+         return all(field in sample for field in [problem_field, answer_field])
+
+     def get_extractor(self) -> GSM8KExtractor:
+         """Get the benchmark extractor for this task."""
+         return self._extractor
+
+     def get_name(self) -> str:
+         """Get the task name."""
+         return f"polymath_{self.config_key}"
+
+     def get_description(self) -> str:
+         """Get the task description."""
+         lang_name = "Chinese" if self.language == "zh" else "English"
+         return f"PolyMath {self.difficulty}-difficulty mathematical problems in {lang_name}"
+
+     def get_categories(self) -> List[str]:
+         """Get the task categories."""
+         return ["mathematics", "reasoning", "multilingual", "text_generation"]
+
+     @classmethod
+     def get_supported_configs(cls) -> List[str]:
+         """Get list of supported PolyMath language-difficulty configurations."""
+         return list(cls.DATASET_CONFIGS.keys())
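A minimal usage sketch of PolyMathTask's language/difficulty configuration (hypothetical driver code, not part of the package diff; the module path is inferred from the file list):

    # Hypothetical usage of PolyMathTask (assumed import path).
    from wisent.core.tasks.polymath_task import PolyMathTask

    print(PolyMathTask.get_supported_configs())     # ['zh_medium', 'en_medium', 'zh_high', 'en_high']
    task = PolyMathTask(language="zh", difficulty="high", limit=2)
    print(task.get_name())                          # polymath_zh_high
    docs = task.load_data()                         # fields normalized to question/answer and Problem/Answer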