wisent 0.5.11__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: the registry flags this release of wisent as potentially problematic.

Files changed (225)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +26 -0
  3. wisent/core/activations/activations.py +96 -0
  4. wisent/core/activations/activations_collector.py +71 -20
  5. wisent/core/activations/prompt_construction_strategy.py +47 -0
  6. wisent/core/agent/budget.py +2 -2
  7. wisent/core/agent/device_benchmarks.py +1 -1
  8. wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
  9. wisent/core/agent/diagnose/response_diagnostics.py +4 -4
  10. wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
  11. wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
  12. wisent/core/agent/diagnose.py +2 -1
  13. wisent/core/autonomous_agent.py +10 -2
  14. wisent/core/benchmark_extractors.py +293 -0
  15. wisent/core/bigcode_integration.py +20 -7
  16. wisent/core/branding.py +108 -0
  17. wisent/core/cli/__init__.py +15 -0
  18. wisent/core/cli/create_steering_vector.py +138 -0
  19. wisent/core/cli/evaluate_responses.py +715 -0
  20. wisent/core/cli/generate_pairs.py +128 -0
  21. wisent/core/cli/generate_pairs_from_task.py +119 -0
  22. wisent/core/cli/generate_responses.py +129 -0
  23. wisent/core/cli/generate_vector_from_synthetic.py +149 -0
  24. wisent/core/cli/generate_vector_from_task.py +147 -0
  25. wisent/core/cli/get_activations.py +191 -0
  26. wisent/core/cli/optimize_classification.py +339 -0
  27. wisent/core/cli/optimize_steering.py +364 -0
  28. wisent/core/cli/tasks.py +182 -0
  29. wisent/core/cli_logger.py +22 -0
  30. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
  31. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
  32. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
  33. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
  34. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
  35. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
  36. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
  37. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  43. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
  44. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
  45. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  46. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  47. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
  48. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
  49. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  50. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  51. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  52. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
  82. wisent/core/data_loaders/__init__.py +235 -0
  83. wisent/core/data_loaders/loaders/lm_loader.py +2 -2
  84. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  85. wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
  86. wisent/core/download_full_benchmarks.py +79 -2
  87. wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
  88. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
  89. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
  90. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
  91. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
  92. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  93. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  94. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
  95. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
  96. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
  97. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  98. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
  99. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
  100. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
  101. wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
  102. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
  103. wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
  104. wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
  105. wisent/core/lm_eval_harness_ground_truth.py +3 -2
  106. wisent/core/main.py +57 -0
  107. wisent/core/model_persistence.py +2 -2
  108. wisent/core/models/wisent_model.py +8 -6
  109. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  110. wisent/core/optuna/steering/steering_optimization.py +1 -1
  111. wisent/core/parser_arguments/__init__.py +10 -0
  112. wisent/core/parser_arguments/agent_parser.py +110 -0
  113. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  114. wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
  115. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  116. wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
  117. wisent/core/parser_arguments/full_optimize_parser.py +115 -0
  118. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  119. wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
  120. wisent/core/parser_arguments/generate_responses_parser.py +15 -0
  121. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
  122. wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
  123. wisent/core/parser_arguments/generate_vector_parser.py +90 -0
  124. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  125. wisent/core/parser_arguments/main_parser.py +152 -0
  126. wisent/core/parser_arguments/model_config_parser.py +59 -0
  127. wisent/core/parser_arguments/monitor_parser.py +17 -0
  128. wisent/core/parser_arguments/multi_steer_parser.py +47 -0
  129. wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
  130. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  131. wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
  132. wisent/core/parser_arguments/synthetic_parser.py +93 -0
  133. wisent/core/parser_arguments/tasks_parser.py +584 -0
  134. wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
  135. wisent/core/parser_arguments/utils.py +111 -0
  136. wisent/core/prompts/core/prompt_formater.py +3 -3
  137. wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
  138. wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
  139. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
  140. wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
  141. wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
  142. wisent/core/steering_optimizer.py +45 -21
  143. wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
  144. wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
  145. wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
  146. wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
  147. wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
  148. wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
  149. wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
  150. wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
  151. wisent/core/tasks/livecodebench_task.py +4 -103
  152. wisent/core/timing_calibration.py +1 -1
  153. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/METADATA +3 -3
  154. wisent-0.5.13.dist-info/RECORD +294 -0
  155. wisent-0.5.13.dist-info/entry_points.txt +2 -0
  156. wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
  157. wisent/classifiers/core/atoms.py +0 -747
  158. wisent/classifiers/models/logistic.py +0 -29
  159. wisent/classifiers/models/mlp.py +0 -47
  160. wisent/cli/classifiers/classifier_rotator.py +0 -137
  161. wisent/cli/cli_logger.py +0 -142
  162. wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
  163. wisent/cli/wisent_cli/commands/listing.py +0 -154
  164. wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
  165. wisent/cli/wisent_cli/main.py +0 -93
  166. wisent/cli/wisent_cli/shell.py +0 -80
  167. wisent/cli/wisent_cli/ui.py +0 -69
  168. wisent/cli/wisent_cli/util/aggregations.py +0 -43
  169. wisent/cli/wisent_cli/util/parsing.py +0 -126
  170. wisent/cli/wisent_cli/version.py +0 -4
  171. wisent/opti/methods/__init__.py +0 -0
  172. wisent/synthetic/__init__.py +0 -0
  173. wisent/synthetic/cleaners/__init__.py +0 -0
  174. wisent/synthetic/cleaners/core/__init__.py +0 -0
  175. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  176. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  177. wisent/synthetic/db_instructions/__init__.py +0 -0
  178. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  179. wisent/synthetic/generators/__init__.py +0 -0
  180. wisent/synthetic/generators/core/__init__.py +0 -0
  181. wisent/synthetic/generators/diversities/__init__.py +0 -0
  182. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  183. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  184. wisent-0.5.11.dist-info/RECORD +0 -220
  185. /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
  186. /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
  187. /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
  188. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
  189. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
  190. /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
  191. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
  192. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
  193. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
  194. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
  195. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
  196. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
  197. /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
  198. /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
  199. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
  200. /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
  201. /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
  202. /wisent/{opti → core/opti}/core/atoms.py +0 -0
  203. /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
  204. /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
  205. /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
  206. /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
  207. /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
  208. /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
  209. /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
  210. /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
  211. /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
  212. /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
  213. /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
  214. /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
  215. /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
  216. /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
  217. /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
  218. /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
  219. /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
  220. /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
  221. /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
  222. /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
  223. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/WHEEL +0 -0
  224. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/licenses/LICENSE +0 -0
  225. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/top_level.txt +0 -0

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 
 from wisent.core.contrastive_pairs.lm_eval_pairs.lm_extractor_registry import get_extractor
-from wisent.cli.cli_logger import setup_logger, bind
+from wisent.core.cli_logger import setup_logger, bind
 
 if TYPE_CHECKING:
     from lm_eval.api.task import ConfigurableTask

wisent/core/data_loaders/__init__.py
@@ -0,0 +1,235 @@
+"""
+Data loaders for various benchmarks.
+
+This module provides data loaders for tasks that need special handling.
+"""
+
+from typing import Any, Dict, List, Optional
+from dataclasses import dataclass
+
+
+__all__ = ["LiveCodeBenchLoader", "LiveCodeBenchProblem"]
+
+
+@dataclass
+class LiveCodeBenchProblem:
+    """
+    A LiveCodeBench coding problem.
+
+    Attributes:
+        question_title: Title of the problem
+        question_content: Full problem description
+        platform: Source platform (codeforces, leetcode, atcoder)
+        question_id: Unique problem identifier
+        contest_id: Contest identifier
+        contest_date: Date of the contest
+        starter_code: Optional starter code template
+        difficulty: Problem difficulty (easy, medium, hard)
+        public_test_cases: Public test cases
+        private_test_cases: Private test cases
+        metadata: Additional metadata
+        answer: Correct answer/solution (for TaskInterface compatibility)
+        good_code: Code that passes tests (from wisent-core)
+        bad_code: Code that fails tests (from wisent-core)
+    """
+    question_title: str
+    question_content: str
+    platform: str
+    question_id: str
+    contest_id: str
+    contest_date: str
+    starter_code: str
+    difficulty: str
+    public_test_cases: List[Any]
+    private_test_cases: List[Any]
+    metadata: Dict[str, Any]
+    answer: Optional[str] = None
+    good_code: Optional[str] = None
+    bad_code: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary format."""
+        return {
+            "task_id": self.question_id,
+            "question_id": self.question_id,  # Also include question_id for compatibility
+            "question_title": self.question_title,
+            "question_content": self.question_content,
+            "platform": self.platform.upper(),
+            "contest_id": self.contest_id,
+            "contest_date": self.contest_date,
+            "starter_code": self.starter_code,
+            "difficulty": self.difficulty.upper(),
+            "public_test_cases": [
+                {
+                    "input": tc if isinstance(tc, str) else str(tc),
+                    "output": "",
+                    "testtype": "FUNCTIONAL"
+                }
+                for tc in self.public_test_cases[:3]  # Limit to first 3 for brevity
+            ] if self.public_test_cases else [],
+            "metadata": self.metadata,
+            "answer": self.good_code,  # Use good_code as the correct answer
+            "good_code": self.good_code,
+            "bad_code": self.bad_code,
+        }
+
+
+class LiveCodeBenchLoader:
+    """
+    LiveCodeBench data loader.
+
+    Loads real coding problems from the LiveCodeBench dataset on HuggingFace.
+    Dataset: livecodebench/code_generation_lite
+
+    Also loads pre-generated good/bad code solutions from local cache.
+    """
+
+    def __init__(self, solution_cache_dir: Optional[str] = None):
+        self._dataset_name = "livecodebench/code_generation_lite"
+        self._cached_dataset = None
+        self._solution_data = None
+        self.solution_cache_dir = solution_cache_dir or "./livecodebench_solutions"
+
+    def list_available_versions(self) -> List[str]:
+        """List available LiveCodeBench versions."""
+        # The dataset doesn't have explicit versions, but we can filter by date ranges
+        return ["release_v1", "release_v2", "all"]
+
+    def get_version_info(self, version: str) -> Dict[str, Any]:
+        """Get information about a specific version."""
+        version_info = {
+            "release_v1": {
+                "version": "release_v1",
+                "description": "LiveCodeBench Release V1 (May 2023 - Oct 2023)",
+                "problems": "~500",
+                "date_range": "2023-05-01 to 2023-10-31",
+            },
+            "release_v2": {
+                "version": "release_v2",
+                "description": "LiveCodeBench Release V2 (Nov 2023 - Apr 2024)",
+                "problems": "~500",
+                "date_range": "2023-11-01 to 2024-04-30",
+            },
+            "all": {
+                "version": "all",
+                "description": "All LiveCodeBench problems",
+                "problems": "1055",
+                "date_range": "2023-05-01 to 2024-12-31",
+            }
+        }
+        return version_info.get(version, version_info["all"])
+
+    def _load_solution_data(self) -> Dict[str, Any]:
+        """
+        Load pre-generated AI model solutions from local cache.
+
+        Returns:
+            Dictionary with question_id -> {good_code, bad_code, difficulty} mapping.
+        """
+        import json
+        from pathlib import Path
+
+        if self._solution_data is not None:
+            return self._solution_data
+
+        cache_file = Path(self.solution_cache_dir) / "solutions.json"
+
+        if not cache_file.exists():
+            import logging
+            logging.warning(
+                f"Solutions cache not found at {cache_file}. "
+                f"Run solution generation first using LiveCodeBenchSolutionGenerator. "
+                f"Problems will have no answer field."
+            )
+            self._solution_data = {}
+            return {}
+
+        with open(cache_file, 'r') as f:
+            data = json.load(f)
+
+        # Create mapping from question_id to solutions
+        solution_map = {}
+        for problem in data.get("problems", []):
+            question_id = problem.get("question_id")
+            if question_id and problem.get("good_example") and problem.get("bad_example"):
+                solution_map[question_id] = {
+                    "good_code": problem["good_example"].get("code", ""),
+                    "bad_code": problem["bad_example"].get("code", ""),
+                    "difficulty": problem.get("difficulty", "unknown"),
+                }
+
+        self._solution_data = solution_map
+        return solution_map
+
+    def load_problems(
+        self,
+        release_version: str = "all",
+        limit: Optional[int] = None
+    ) -> List[LiveCodeBenchProblem]:
+        """
+        Load LiveCodeBench problems from HuggingFace.
+
+        Arguments:
+            release_version: Version to load (release_v1, release_v2, or all)
+            limit: Maximum number of problems to load
+
+        Returns:
+            List of LiveCodeBenchProblem objects
+        """
+        from datasets import load_dataset
+
+        # Load dataset (cached after first load)
+        if self._cached_dataset is None:
+            self._cached_dataset = load_dataset(self._dataset_name, split="test")
+
+        dataset = self._cached_dataset
+
+        # Filter by version if needed
+        if release_version == "release_v1":
+            # Filter problems from May 2023 - Oct 2023
+            dataset = dataset.filter(
+                lambda x: x["contest_date"] >= "2023-05-01" and x["contest_date"] <= "2023-10-31"
+            )
+        elif release_version == "release_v2":
+            # Filter problems from Nov 2023 - Apr 2024
+            dataset = dataset.filter(
+                lambda x: x["contest_date"] >= "2023-11-01" and x["contest_date"] <= "2024-04-30"
+            )
+        # "all" or any other value: use all problems
+
+        # Apply limit
+        if limit:
+            dataset = dataset.select(range(min(limit, len(dataset))))
+
+        # Load solution data from wisent-core
+        solution_map = self._load_solution_data()
+
+        # Convert to LiveCodeBenchProblem objects
+        problems = []
+        for item in dataset:
+            question_id = item.get("question_id", "")
+
+            # Get solutions if available
+            solutions = solution_map.get(question_id, {})
+            good_code = solutions.get("good_code")
+            bad_code = solutions.get("bad_code")
+
+            problem = LiveCodeBenchProblem(
+                question_title=item.get("question_title", ""),
+                question_content=item.get("question_content", ""),
+                platform=item.get("platform", ""),
+                question_id=question_id,
+                contest_id=str(item.get("contest_id", "")),
+                contest_date=item.get("contest_date", ""),
+                starter_code=item.get("starter_code", ""),
+                difficulty=item.get("difficulty", ""),
+                public_test_cases=item.get("public_test_cases", []),
+                private_test_cases=item.get("private_test_cases", []),
+                metadata=item.get("metadata", {}),
+                answer=good_code,  # Set answer field for TaskInterface
+                good_code=good_code,
+                bad_code=bad_code,
+            )
+            problems.append(problem)
+
+        return problems
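
For orientation, a minimal usage sketch of the new loader (illustrative, not part of the published diff). It assumes the `datasets` dependency is installed and the HuggingFace dataset is reachable; with no solutions.json cache generated yet, `answer` and `good_code` stay None:

    from wisent.core.data_loaders import LiveCodeBenchLoader

    loader = LiveCodeBenchLoader()  # cache dir defaults to ./livecodebench_solutions
    print(loader.list_available_versions())  # ['release_v1', 'release_v2', 'all']
    print(loader.get_version_info("release_v2")["date_range"])

    # Pull a handful of problems from the Nov 2023 - Apr 2024 window.
    problems = loader.load_problems(release_version="release_v2", limit=5)
    for p in problems:
        print(p.question_id, p.difficulty, p.answer is not None)
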
wisent/core/data_loaders/loaders/lm_loader.py
@@ -91,8 +91,8 @@ class LMEvalDataLoader(BaseDataLoader):
         train_set = ContrastivePairSet("lm_eval_train", train_pairs, task_type=task_name)
         test_set = ContrastivePairSet("lm_eval_test", test_pairs, task_type=task_name)
 
-        train_set.validate()
-        test_set.validate()
+        train_set.validate(raise_on_critical=False)
+        test_set.validate(raise_on_critical=False)
 
         return LoadDataResult(
             train_qa_pairs=train_set,
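
A hedged reading of this two-line change (the validate() signature itself is not shown in this diff): with raise_on_critical=False, critical validation findings are presumably reported rather than raised, so loading proceeds on imperfect pair sets:

    train_set = ContrastivePairSet("lm_eval_train", train_pairs, task_type=task_name)
    train_set.validate()                         # 0.5.11 behavior: may raise on malformed pairs
    train_set.validate(raise_on_critical=False)  # 0.5.13 behavior: issues reported, load continues
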
wisent/core/data_loaders/loaders/task_interface_loader.py
@@ -0,0 +1,300 @@
+from __future__ import annotations
+from typing import Any, Optional
+import logging
+import random
+
+from wisent.core.data_loaders.core.atoms import BaseDataLoader, DataLoaderError, LoadDataResult
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
+from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
+from wisent.core.task_interface import get_task, list_tasks
+from wisent.core.task_interface import TaskInterface
+
+__all__ = [
+    "TaskInterfaceDataLoader",
+]
+
+log = logging.getLogger(__name__)
+
+
+class TaskInterfaceDataLoader(BaseDataLoader):
+    """
+    Load contrastive pairs from TaskInterface tasks (AIME, HMMT, LiveCodeBench, etc.).
+
+    This loader bridges TaskInterface tasks with the CLI training pipeline by:
+    1. Loading problem data from TaskInterface tasks
+    2. Converting problems into contrastive pairs (correct/incorrect answers)
+    3. Splitting into train/test sets
+
+    Usage:
+        wisent train model meta-llama/Llama-3.2-1B-Instruct loader task_interface task gsm8k training_limit 100
+        wisent train model meta-llama/Llama-3.2-1B-Instruct loader task_interface task aime training_limit 50
+        wisent train model meta-llama/Llama-3.2-1B-Instruct loader task_interface task livecodebench training_limit 200
+    """
+
+    name = "task_interface"
+    description = "Load from TaskInterface tasks (AIME, HMMT, LiveCodeBench, GSM8K, etc.)"
+
+    def load(
+        self,
+        task: Optional[str] = None,
+        split_ratio: Optional[float] = None,
+        seed: int = 42,
+        limit: Optional[int] = None,
+        training_limit: Optional[int] = None,
+        testing_limit: Optional[int] = None,
+        **kwargs: Any,
+    ) -> LoadDataResult:
+        """
+        Load contrastive pairs from a TaskInterface task.
+
+        Arguments:
+            task: Name of the TaskInterface task (e.g., 'gsm8k', 'aime', 'livecodebench')
+            split_ratio: Fraction of data for training (default: 0.8)
+            seed: Random seed for splitting
+            limit: Total number of problems to load
+            training_limit: Maximum training examples
+            testing_limit: Maximum testing examples
+            **kwargs: Additional arguments passed to the task
+
+        Returns:
+            LoadDataResult with train/test contrastive pairs
+
+        Raises:
+            DataLoaderError: If task is not specified or not found
+        """
+        if not task:
+            available = list_tasks()
+            raise DataLoaderError(
+                f"TaskInterface loader requires a 'task' parameter. "
+                f"Available tasks: {', '.join(available[:10])}..."
+            )
+
+        # Ensure split ratio is valid
+        split_ratio = self._effective_split(split_ratio)
+
+        # Load the task
+        try:
+            task_obj: TaskInterface = get_task(task, limit=limit)
+        except ValueError as e:
+            available = list_tasks()
+            raise DataLoaderError(
+                f"TaskInterface task '{task}' not found. "
+                f"Available tasks: {', '.join(available[:20])}..."
+            ) from e
+
+        # Load problem data
+        log.info(f"Loading data from TaskInterface task: {task}")
+        problems = task_obj.load_data(limit=limit)
+
+        if not problems:
+            raise DataLoaderError(f"TaskInterface task '{task}' returned no data")
+
+        log.info(f"Loaded {len(problems)} problems from {task}")
+
+        # Convert problems to contrastive pairs
+        pairs = self._convert_to_contrastive_pairs(task_obj, problems)
+
+        if not pairs:
+            raise DataLoaderError(
+                f"Could not generate any contrastive pairs from {task}. "
+                f"Problems may be missing required fields."
+            )
+
+        log.info(f"Generated {len(pairs)} contrastive pairs")
+
+        # Shuffle and split
+        random.seed(seed)
+        random.shuffle(pairs)
+
+        split_idx = int(len(pairs) * split_ratio)
+        train_pairs = pairs[:split_idx]
+        test_pairs = pairs[split_idx:]
+
+        # Apply limits
+        if training_limit:
+            train_pairs = train_pairs[:training_limit]
+        if testing_limit:
+            test_pairs = test_pairs[:testing_limit]
+
+        log.info(f"Split: {len(train_pairs)} train, {len(test_pairs)} test")
+
+        # Create ContrastivePairSets
+        train_set = ContrastivePairSet(name=f"{task}_train", pairs=train_pairs, task_type="classification")
+        test_set = ContrastivePairSet(name=f"{task}_test", pairs=test_pairs, task_type="classification")
+
+        return LoadDataResult(
+            train_qa_pairs=train_set,
+            test_qa_pairs=test_set,
+            task_type="classification",
+            lm_task_data=None,  # TaskInterface tasks don't use lm-eval format
+        )
+
+    def _convert_to_contrastive_pairs(
+        self,
+        task_obj: TaskInterface,
+        problems: list[dict[str, Any]],
+    ) -> list[ContrastivePair]:
+        """
+        Convert task problems into contrastive pairs.
+
+        For each problem, we create a contrastive pair with:
+        - Positive response: The correct answer
+        - Negative response: An incorrect answer (generated or from problem data)
+
+        Arguments:
+            task_obj: The TaskInterface task object
+            problems: List of problem dictionaries
+
+        Returns:
+            List of ContrastivePair objects
+        """
+        pairs = []
+        extractor = task_obj.get_extractor()
+        task_name = task_obj.get_name()
+
+        for idx, problem in enumerate(problems):
+            try:
+                # Get the prompt/question
+                if hasattr(task_obj, 'doc_to_text'):
+                    prompt = task_obj.doc_to_text(problem)
+                else:
+                    # Fallback: extract prompt from problem dict
+                    prompt = self._extract_prompt_from_problem(problem, task_name)
+
+                if not prompt:
+                    log.warning(f"Problem {idx} has no prompt, skipping")
+                    continue
+
+                # Get correct answer
+                correct_answer = self._get_correct_answer(problem, task_name)
+                if not correct_answer:
+                    log.warning(f"Problem {idx} has no correct answer, skipping")
+                    continue
+
+                # Generate incorrect answer
+                incorrect_answer = self._generate_incorrect_answer(
+                    problem, correct_answer, task_name, extractor
+                )
+                if not incorrect_answer:
+                    log.warning(f"Problem {idx}: could not generate incorrect answer, skipping")
+                    continue
+
+                # Create contrastive pair
+                pair = ContrastivePair(
+                    prompt=prompt,
+                    positive_response=PositiveResponse(model_response=correct_answer),
+                    negative_response=NegativeResponse(model_response=incorrect_answer),
+                    label="correct",
+                )
+                pairs.append(pair)
+
+            except Exception as e:
+                log.warning(f"Problem {idx}: failed to create pair: {e}")
+                continue
+
+        return pairs
+
+    def _extract_prompt_from_problem(self, problem: dict[str, Any], task_name: str) -> Optional[str]:
+        """Extract prompt/question from a problem dict."""
+        # Try common field names for prompts
+        prompt_fields = ["question", "prompt", "problem", "text", "input", "query", "doc"]
+
+        for field in prompt_fields:
+            if field in problem:
+                value = problem[field]
+                if isinstance(value, str) and value.strip():
+                    return value.strip()
+
+        # If no prompt found, try to construct from problem data
+        if task_name == "gsm8k" and "question" in problem:
+            return problem["question"]
+
+        return None
+
+    def _get_correct_answer(self, problem: dict[str, Any], task_name: str) -> Optional[str]:
+        """Extract the correct answer from a problem."""
+        # Try common field names
+        answer_fields = ["answer", "target", "label", "solution", "expected_output"]
+
+        for field in answer_fields:
+            if field in problem:
+                answer = problem[field]
+                if isinstance(answer, (str, int, float)):
+                    return str(answer)
+                elif isinstance(answer, dict):
+                    # Try to extract from nested dict
+                    if "answer" in answer:
+                        return str(answer["answer"])
+                    if "text" in answer:
+                        return str(answer["text"])
+
+        return None
+
+    def _generate_incorrect_answer(
+        self,
+        problem: dict[str, Any],
+        correct_answer: str,
+        task_name: str,
+        extractor: Any,
+    ) -> Optional[str]:
+        """
+        Generate an incorrect answer for a problem.
+
+        Strategy:
+        1. Check if problem has bad_code/incorrect answer field (for LiveCodeBench)
+        2. If problem has multiple choices, use an incorrect choice
+        3. For numerical answers, perturb the number
+        4. For text answers, use a generic incorrect response
+        """
+        # Strategy 0: Check for explicit bad_code field (LiveCodeBench)
+        if "bad_code" in problem and problem["bad_code"]:
+            return problem["bad_code"]
+
+        # Strategy 1: Check for multiple choice options
+        choices_fields = ["choices", "options", "mc1_targets", "mc2_targets"]
+        for field in choices_fields:
+            if field in problem:
+                choices = problem[field]
+                if isinstance(choices, dict):
+                    # Handle mc1_targets/mc2_targets format
+                    if "choices" in choices and "labels" in choices:
+                        incorrect_indices = [
+                            i for i, label in enumerate(choices["labels"]) if label == 0
+                        ]
+                        if incorrect_indices:
+                            return choices["choices"][random.choice(incorrect_indices)]
+                elif isinstance(choices, list) and choices:
+                    # Filter out correct answer
+                    incorrect_choices = [
+                        c for c in choices
+                        if self._normalize_for_comparison(str(c)) != self._normalize_for_comparison(correct_answer)
+                    ]
+                    if incorrect_choices:
+                        return str(random.choice(incorrect_choices))
+
+        # Strategy 2: Numerical answer perturbation
+        try:
+            correct_num = float(correct_answer.strip())
+            # Perturb by 10-50%
+            perturbation = random.uniform(1.1, 1.5) if random.random() > 0.5 else random.uniform(0.5, 0.9)
+            incorrect_num = correct_num * perturbation
+            return str(int(incorrect_num) if correct_num == int(correct_num) else round(incorrect_num, 2))
+        except (ValueError, AttributeError):
+            pass
+
+        # Strategy 3: Generic incorrect responses by task type
+        if task_name in ["gsm8k", "math500", "aime", "hmmt", "polymath", "livemathbench"]:
+            # Math tasks: slightly wrong number
+            return str(random.randint(0, 1000))
+        elif task_name in ["livecodebench", "humaneval", "mbpp"]:
+            # Coding tasks: empty or syntax error
+            return "# Incomplete solution\npass"
+        else:
+            # Generic incorrect response
+            return "I don't know"
+
+    @staticmethod
+    def _normalize_for_comparison(text: str) -> str:
+        """Normalize text for comparison."""
+        return text.lower().strip()
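
A minimal sketch of driving the new loader directly (illustrative, not part of the published diff); it assumes the gsm8k task is registered with TaskInterface and that the loader can be constructed without arguments:

    from wisent.core.data_loaders.loaders.task_interface_loader import TaskInterfaceDataLoader

    loader = TaskInterfaceDataLoader()  # assumes BaseDataLoader needs no constructor args
    result = loader.load(task="gsm8k", limit=200, split_ratio=0.8, seed=42)

    # 80/20 split over the generated contrastive pairs
    print(len(result.train_qa_pairs.pairs), "train pairs")  # assumes ContrastivePairSet exposes .pairs
    print(len(result.test_qa_pairs.pairs), "test pairs")
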
wisent/core/data_loaders/rotator.py
@@ -14,7 +14,7 @@ class DataLoaderRotator:
     def __init__(
         self,
         loader: Union[str, BaseDataLoader, Type[BaseDataLoader], None] = None,
-        loaders_location: Union[str, Path] = "wisent_guard.core.data_loaders.loaders",
+        loaders_location: Union[str, Path] = "wisent.core.data_loaders.loaders",
         autoload: bool = True,
         **default_loader_kwargs: Any,
     ) -> None:
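
The old default pointed at the pre-rename wisent_guard package, so loader discovery would fail on a wisent-only install. A quick standalone check (assuming wisent_guard is not installed alongside):

    import importlib

    for path in ("wisent_guard.core.data_loaders.loaders",
                 "wisent.core.data_loaders.loaders"):
        try:
            importlib.import_module(path)
            print(path, "-> importable")
        except ModuleNotFoundError:
            print(path, "-> not found")
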
wisent/core/download_full_benchmarks.py
@@ -1276,6 +1276,80 @@ class FullBenchmarkDownloader:
 
         return pairs
 
+    def _perturb_code_to_break(self, code: str) -> str:
+        """
+        Perturb correct code to make it broken/unable to execute at runtime.
+
+        Introduces various types of bugs:
+        - Syntax errors (missing colons, parentheses)
+        - Runtime errors (undefined variables)
+        - Logic errors (wrong operators)
+        - Type errors (wrong return values)
+
+        Args:
+            code: Correct Python code
+
+        Returns:
+            Broken version of the code
+        """
+        lines = code.split('\n')
+        if not lines:
+            return "pass # Broken code"
+
+        # Choose a random perturbation strategy
+        perturbation_type = random.choice([
+            'remove_colon',
+            'remove_return',
+            'wrong_variable',
+            'syntax_error',
+            'wrong_operator',
+            'incomplete_code'
+        ])
+
+        if perturbation_type == 'remove_colon':
+            # Remove colons from function/if/for statements
+            for i, line in enumerate(lines):
+                if any(keyword in line for keyword in ['def ', 'if ', 'for ', 'while ', 'elif ', 'else:']):
+                    lines[i] = line.replace(':', '')
+                    break
+
+        elif perturbation_type == 'remove_return':
+            # Remove return statement to break function
+            for i, line in enumerate(lines):
+                if 'return ' in line:
+                    lines[i] = line.replace('return ', '# return ')
+                    break
+
+        elif perturbation_type == 'wrong_variable':
+            # Use undefined variable name
+            for i, line in enumerate(lines):
+                if '=' in line and 'def ' not in line:
+                    lines[i] = line.replace('=', '= undefined_variable +')
+                    break
+
+        elif perturbation_type == 'syntax_error':
+            # Add syntax error by removing closing parenthesis
+            for i, line in enumerate(lines):
+                if '(' in line and ')' in line:
+                    lines[i] = line.replace(')', '', 1)
+                    break
+
+        elif perturbation_type == 'wrong_operator':
+            # Change operators to break logic
+            for i, line in enumerate(lines):
+                if any(op in line for op in ['+', '-', '*', '/', '<', '>', '==']):
+                    line = line.replace('+', '-', 1) if '+' in line else line
+                    line = line.replace('<', '>', 1) if '<' in line else line
+                    lines[i] = line
+                    break
+
+        elif perturbation_type == 'incomplete_code':
+            # Return only first half of code to make it incomplete
+            lines = lines[:max(1, len(lines) // 2)]
+            lines.append(" # Incomplete implementation")
+
+        return '\n'.join(lines)
+
     def _convert_mbpp_format(self, sample: Dict[str, Any]) -> List[Dict[str, Any]]:
         """Convert MBPP/HumanEval code generation format (task_id, code, prompt, test)."""
         task_id = sample.get("task_id", "")
@@ -1285,16 +1359,19 @@
 
         # For code generation tasks, we create contrastive pairs based on:
         # Correct: The reference code solution
-        # Incorrect: A placeholder for incorrect/buggy code (since we don't have real incorrect solutions)
+        # Incorrect: Perturbed version with bugs that prevent runtime execution
 
         pairs = []
 
+        # Generate incorrect code by perturbing the correct solution
+        incorrect_code = self._perturb_code_to_break(code)
+
         # Create a contrastive pair with the coding prompt
         pairs.append(
             {
                 "question": f"Write Python code to solve this problem:\n\n{prompt}",
                 "correct_answer": code,
-                "incorrect_answer": "# This is a placeholder for incorrect code\n# In practice, this would be buggy or incomplete code\npass", # TODO
+                "incorrect_answer": incorrect_code,
                 "metadata": {
                     "task_id": task_id,
                     "test_cases": test,