wisent 0.5.12__py3-none-any.whl → 0.5.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of wisent might be problematic.

Files changed (227)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +26 -0
  3. wisent/core/activations/activations.py +96 -0
  4. wisent/core/activations/activations_collector.py +71 -20
  5. wisent/core/activations/prompt_construction_strategy.py +47 -0
  6. wisent/core/agent/__init__.py +1 -18
  7. wisent/core/agent/budget.py +2 -2
  8. wisent/core/agent/device_benchmarks.py +1 -1
  9. wisent/core/agent/diagnose/__init__.py +1 -55
  10. wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
  11. wisent/core/agent/diagnose/response_diagnostics.py +4 -4
  12. wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
  13. wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
  14. wisent/core/agent/diagnose.py +2 -1
  15. wisent/core/autonomous_agent.py +10 -2
  16. wisent/core/benchmark_extractors.py +293 -0
  17. wisent/core/bigcode_integration.py +20 -7
  18. wisent/core/branding.py +108 -0
  19. wisent/core/cli/__init__.py +15 -0
  20. wisent/core/cli/create_steering_vector.py +138 -0
  21. wisent/core/cli/evaluate_responses.py +715 -0
  22. wisent/core/cli/generate_pairs.py +128 -0
  23. wisent/core/cli/generate_pairs_from_task.py +119 -0
  24. wisent/core/cli/generate_responses.py +129 -0
  25. wisent/core/cli/generate_vector_from_synthetic.py +149 -0
  26. wisent/core/cli/generate_vector_from_task.py +147 -0
  27. wisent/core/cli/get_activations.py +191 -0
  28. wisent/core/cli/optimize_classification.py +339 -0
  29. wisent/core/cli/optimize_steering.py +364 -0
  30. wisent/core/cli/tasks.py +182 -0
  31. wisent/core/cli_logger.py +22 -0
  32. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
  33. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
  34. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
  35. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
  36. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
  37. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  43. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
  44. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  45. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
  46. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
  47. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  48. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  49. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
  50. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
  51. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  52. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
  84. wisent/core/data_loaders/__init__.py +235 -0
  85. wisent/core/data_loaders/loaders/lm_loader.py +2 -2
  86. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  87. wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
  88. wisent/core/download_full_benchmarks.py +79 -2
  89. wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
  90. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
  91. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
  92. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
  93. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
  94. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  95. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  96. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
  97. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
  98. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
  99. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  100. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
  101. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
  102. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
  103. wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
  104. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
  105. wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
  106. wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
  107. wisent/core/lm_eval_harness_ground_truth.py +3 -2
  108. wisent/core/main.py +57 -0
  109. wisent/core/model_persistence.py +2 -2
  110. wisent/core/models/wisent_model.py +6 -6
  111. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  112. wisent/core/optuna/steering/steering_optimization.py +1 -1
  113. wisent/core/parser_arguments/__init__.py +10 -0
  114. wisent/core/parser_arguments/agent_parser.py +110 -0
  115. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  116. wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
  117. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  118. wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
  119. wisent/core/parser_arguments/full_optimize_parser.py +115 -0
  120. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  121. wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
  122. wisent/core/parser_arguments/generate_responses_parser.py +15 -0
  123. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
  124. wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
  125. wisent/core/parser_arguments/generate_vector_parser.py +90 -0
  126. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  127. wisent/core/parser_arguments/main_parser.py +152 -0
  128. wisent/core/parser_arguments/model_config_parser.py +59 -0
  129. wisent/core/parser_arguments/monitor_parser.py +17 -0
  130. wisent/core/parser_arguments/multi_steer_parser.py +47 -0
  131. wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
  132. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  133. wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
  134. wisent/core/parser_arguments/synthetic_parser.py +93 -0
  135. wisent/core/parser_arguments/tasks_parser.py +584 -0
  136. wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
  137. wisent/core/parser_arguments/utils.py +111 -0
  138. wisent/core/prompts/core/prompt_formater.py +3 -3
  139. wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
  140. wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
  141. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
  142. wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
  143. wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
  144. wisent/core/steering_optimizer.py +45 -21
  145. wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
  146. wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
  147. wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
  148. wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
  149. wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
  150. wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
  151. wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
  152. wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
  153. wisent/core/tasks/livecodebench_task.py +4 -103
  154. wisent/core/timing_calibration.py +1 -1
  155. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/METADATA +3 -3
  156. wisent-0.5.14.dist-info/RECORD +294 -0
  157. wisent-0.5.14.dist-info/entry_points.txt +2 -0
  158. wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
  159. wisent/classifiers/core/atoms.py +0 -747
  160. wisent/classifiers/models/logistic.py +0 -29
  161. wisent/classifiers/models/mlp.py +0 -47
  162. wisent/cli/classifiers/classifier_rotator.py +0 -137
  163. wisent/cli/cli_logger.py +0 -142
  164. wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
  165. wisent/cli/wisent_cli/commands/listing.py +0 -154
  166. wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
  167. wisent/cli/wisent_cli/main.py +0 -93
  168. wisent/cli/wisent_cli/shell.py +0 -80
  169. wisent/cli/wisent_cli/ui.py +0 -69
  170. wisent/cli/wisent_cli/util/aggregations.py +0 -43
  171. wisent/cli/wisent_cli/util/parsing.py +0 -126
  172. wisent/cli/wisent_cli/version.py +0 -4
  173. wisent/opti/methods/__init__.py +0 -0
  174. wisent/synthetic/__init__.py +0 -0
  175. wisent/synthetic/cleaners/__init__.py +0 -0
  176. wisent/synthetic/cleaners/core/__init__.py +0 -0
  177. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  178. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  179. wisent/synthetic/db_instructions/__init__.py +0 -0
  180. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  181. wisent/synthetic/generators/__init__.py +0 -0
  182. wisent/synthetic/generators/core/__init__.py +0 -0
  183. wisent/synthetic/generators/diversities/__init__.py +0 -0
  184. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  185. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  186. wisent-0.5.12.dist-info/RECORD +0 -220
  187. /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
  188. /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
  189. /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
  190. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
  191. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
  192. /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
  193. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
  194. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
  195. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
  196. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
  197. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
  198. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
  199. /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
  200. /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
  201. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
  202. /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
  203. /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
  204. /wisent/{opti → core/opti}/core/atoms.py +0 -0
  205. /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
  206. /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
  207. /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
  208. /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
  209. /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
  210. /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
  211. /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
  212. /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
  213. /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
  214. /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
  215. /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
  216. /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
  217. /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
  218. /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
  219. /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
  220. /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
  221. /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
  222. /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
  223. /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
  224. /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
  225. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/WHEEL +0 -0
  226. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/licenses/LICENSE +0 -0
  227. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/top_level.txt +0 -0
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py
@@ -0,0 +1,112 @@
+ from __future__ import annotations
+
+ from typing import Any, TYPE_CHECKING
+
+ from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+ from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+ from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+ from wisent.core.cli_logger import setup_logger, bind
+
+ if TYPE_CHECKING:
+     from lm_eval.api.task import ConfigurableTask
+
+
+ __all__ = ["HeadQAExtractor"]
+ _LOG = setup_logger(__name__)
+
+
+ class HeadQAExtractor(LMEvalBenchmarkExtractor):
+     """Extractor for the HeadQA benchmark."""
+
+     def extract_contrastive_pairs(
+         self,
+         lm_eval_task_data: ConfigurableTask,
+         limit: int | None = None,
+     ) -> list[ContrastivePair]:
+         """
+         Build contrastive pairs from HeadQA docs.
+
+         HeadQA schema:
+             - qtext: str
+             - answers: list of dicts with "aid" and "atext" keys
+             - ra: 1-based index of the correct answer, int
+
+         Args:
+             lm_eval_task_data: lm-eval task instance for HeadQA.
+             limit: Optional maximum number of pairs to produce.
+
+         Returns:
+             A list of ContrastivePair objects.
+         """
+         log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+
+         max_items = self._normalize_limit(limit)
+         docs = self.load_docs(lm_eval_task_data, max_items)
+
+         pairs: list[ContrastivePair] = []
+
+         log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+
+         for doc in docs:
+             pair = self._extract_pair_from_doc(doc)
+             if pair is not None:
+                 pairs.append(pair)
+             if max_items is not None and len(pairs) >= max_items:
+                 break
+
+         if not pairs:
+             task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+             log.warning("No valid HeadQA pairs extracted", extra={"task": task_name})
+
+         return pairs
+
+     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+         """
+         Convert a single HeadQA doc into a ContrastivePair, if possible.
+         Returns None when required fields are missing or malformed.
+         """
+         log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+
+         try:
+             qtext = str(doc.get("qtext", "")).strip()
+             answers = doc.get("answers", [])
+             answers = [answer["atext"] for answer in answers]
+             answer_idx = int(doc.get("ra", 0)) - 1  # "ra" is 1-based; missing -> -1 (invalid)
+
+             if not qtext or not answers or not (0 <= answer_idx < len(answers)):
+                 log.debug(
+                     "Skipping doc due to missing/invalid fields",
+                     extra={"doc": doc},
+                 )
+                 return None
+
+             correct = answers[answer_idx]
+             incorrect = answers[(answer_idx + 1) % len(answers)]
+
+             formatted_question = f"Question: {qtext}\nAnswer:\nA. {incorrect}\nB. {correct}"
+
+             metadata = {
+                 "label": "headqa",
+             }
+
+             return self._build_pair(
+                 question=formatted_question,
+                 correct=correct,
+                 incorrect=incorrect,
+                 metadata=metadata,
+             )
+
+         except Exception as exc:
+             log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+             return None
+
+     @staticmethod
+     def _build_pair(
+         question: str,
+         correct: str,
+         incorrect: str,
+         metadata: dict[str, Any] | None = None,
+     ) -> ContrastivePair:
+         positive_response = PositiveResponse(model_response=correct)
+         negative_response = NegativeResponse(model_response=incorrect)
+         return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
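
The extractor above follows the shared pattern for these lm-eval extractors: load the task docs, take the gold answer as the positive response, take a neighbouring option as the negative, and wrap both in a ContrastivePair. A minimal usage sketch (not part of this diff), assuming lm-eval-harness is installed and registers the task under the name "headqa_en":

# Hedged sketch: drive HeadQAExtractor from an lm-eval task object.
# The task name "headqa_en" and the get_task_dict call are assumptions about
# the caller's environment, not something this wheel ships.
from lm_eval.tasks import get_task_dict

from wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_extractors.headqa import HeadQAExtractor

task = get_task_dict(["headqa_en"])["headqa_en"]
pairs = HeadQAExtractor().extract_contrastive_pairs(task, limit=5)
for pair in pairs:
    print(pair.prompt)
    print("  positive:", pair.positive_response.model_response)
    print("  negative:", pair.negative_response.model_response)
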
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py
@@ -0,0 +1,113 @@
+ from __future__ import annotations
+
+ from typing import Any, TYPE_CHECKING
+
+ from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+ from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+ from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+ from wisent.core.cli_logger import setup_logger, bind
+
+ if TYPE_CHECKING:
+     from lm_eval.api.task import ConfigurableTask
+
+
+ __all__ = ["HellaSwagExtractor"]
+ _LOG = setup_logger(__name__)
+
+
+ class HellaSwagExtractor(LMEvalBenchmarkExtractor):
+     """Extractor for the HellaSwag benchmark."""
+
+     def extract_contrastive_pairs(
+         self,
+         lm_eval_task_data: ConfigurableTask,
+         limit: int | None = None,
+     ) -> list[ContrastivePair]:
+         """
+         Build contrastive pairs from HellaSwag docs.
+
+         HellaSwag schema:
+             - query: str
+             - endings: list of str
+             - label: index of correct ending, str
+
+         Args:
+             lm_eval_task_data: lm-eval task instance for HellaSwag.
+             limit: Optional maximum number of pairs to produce.
+
+         Returns:
+             A list of ContrastivePair objects.
+         """
+         log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+
+         max_items = self._normalize_limit(limit)
+         docs = self.load_docs(lm_eval_task_data, max_items)
+
+         pairs: list[ContrastivePair] = []
+
+         log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+
+         for doc in docs:
+             pair = self._extract_pair_from_doc(doc)
+             if pair is not None:
+                 pairs.append(pair)
+             if max_items is not None and len(pairs) >= max_items:
+                 break
+
+         if not pairs:
+             task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+             log.warning("No valid HellaSwag pairs extracted", extra={"task": task_name})
+
+         return pairs
+
+     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+         """
+         Convert a single HellaSwag doc into a ContrastivePair, if possible.
+         Returns None when required fields are missing or malformed.
+         """
+         log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+
+         try:
+             query = str(doc.get("query", "")).strip()
+             endings = doc.get("endings", [])
+             label = str(doc.get("label", "")).strip()
+             label = int(label)
+
+             if not query or not endings or not (0 <= label < len(endings)):
+                 log.debug(
+                     "Skipping doc due to missing/invalid fields",
+                     extra={"doc": doc},
+                 )
+                 return None
+
+             correct = endings[label]
+             incorrect = endings[(label + 1) % len(endings)]
+
+             question = f"{query}"
+             formatted_question = f"{question}\nA. {incorrect}\nB. {correct}"
+
+             metadata = {
+                 "label": "hellaswag",
+             }
+
+             return self._build_pair(
+                 question=formatted_question,
+                 correct=correct,
+                 incorrect=incorrect,
+                 metadata=metadata,
+             )
+
+         except Exception as exc:
+             log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+             return None
+
+     @staticmethod
+     def _build_pair(
+         question: str,
+         correct: str,
+         incorrect: str,
+         metadata: dict[str, Any] | None = None,
+     ) -> ContrastivePair:
+         positive_response = PositiveResponse(model_response=correct)
+         negative_response = NegativeResponse(model_response=incorrect)
+         return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
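
Both extractors above pick the distractor deterministically: the negative response is the option immediately after the gold one, wrapping around at the end of the list. A small self-contained illustration of that rule (plain Python, no wisent imports needed; the sample endings are made up):

# Distractor-selection rule used by HeadQAExtractor and HellaSwagExtractor:
# negative = options[(gold_index + 1) % len(options)].
endings = ["keeps pedalling", "falls asleep", "waves at the crowd", "starts flying"]
label = 3                                        # gold ending: "starts flying"
correct = endings[label]
incorrect = endings[(label + 1) % len(endings)]  # wraps to index 0: "keeps pedalling"
print(correct, "|", incorrect)
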
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py
@@ -0,0 +1,367 @@
+ from __future__ import annotations
+
+ import json
+ import os
+ from pathlib import Path
+ from typing import Any, TYPE_CHECKING
+
+ from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+ from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+ from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+ from wisent.core.cli_logger import setup_logger, bind
+
+ if TYPE_CHECKING:
+     from lm_eval.api.task import ConfigurableTask
+
+
+ __all__ = ["LiveCodeBenchExtractor"]
+ _LOG = setup_logger(__name__)
+
+
+ class LiveCodeBenchExtractor(LMEvalBenchmarkExtractor):
+     """Extractor for the LiveCodeBench benchmark using AI model submissions from HuggingFace Space."""
+
+     LIVECODEBENCH_SPACE = "livecodebench/code_generation_samples"
+     DEFAULT_MODEL = "GPT-4O-2024-08-06"  # One of the 22 available models
+
+     def __init__(self, model_name: str | None = None):
+         """
+         Initialize the LiveCodeBench extractor.
+
+         Args:
+             model_name: Name of the model to extract submissions from.
+                 If None, uses DEFAULT_MODEL.
+                 Available models include: DeepSeek-V3, GPT-4O-2024-08-06, O1-2024-12-17, etc.
+         """
+         self.model_name = model_name or self.DEFAULT_MODEL
+         self._solution_data = None
+         self._problems_data = None
+
+     def _download_space_files(self) -> tuple[dict[str, Any], list[dict[str, Any]]]:
+         """
+         Download all_outputs.json and problems.json from HuggingFace Space.
+
+         Returns:
+             Tuple of (all_outputs dict, problems list)
+         """
+         try:
+             from huggingface_hub import hf_hub_download
+
+             _LOG.info(f"Downloading data from HuggingFace Space: {self.LIVECODEBENCH_SPACE}")
+
+             # Download all_outputs.json (model submissions)
+             all_outputs_path = hf_hub_download(
+                 repo_id=self.LIVECODEBENCH_SPACE,
+                 filename="all_outputs.json",
+                 repo_type="space"
+             )
+
+             # Download problems.json (problem metadata)
+             problems_path = hf_hub_download(
+                 repo_id=self.LIVECODEBENCH_SPACE,
+                 filename="problems.json",
+                 repo_type="space"
+             )
+
+             with open(all_outputs_path, 'r') as f:
+                 all_outputs = json.load(f)
+
+             with open(problems_path, 'r') as f:
+                 problems = json.load(f)
+
+             return all_outputs, problems
+
+         except ImportError:
+             raise ImportError(
+                 "huggingface_hub is required to download from HuggingFace. "
+                 "Install it with: pip install huggingface_hub"
+             )
+         except Exception as e:
+             raise RuntimeError(f"Failed to download from Space: {e}")
+
+
+     def _load_solution_data(self) -> dict[str, Any]:
+         """
+         Load AI model solutions and create contrastive pairs.
+
+         Downloads all_outputs.json and problems.json from HuggingFace Space
+         and extracts passing/failing solutions for the specified model.
+
+         Returns:
+             Dictionary with question_id -> {good_example, bad_example} mapping.
+         """
+         if self._solution_data is not None:
+             return self._solution_data
+
+         # Download Space files
+         all_outputs, problems = self._download_space_files()
+
+         # Check if requested model exists
+         if self.model_name not in all_outputs:
+             available_models = list(all_outputs.keys())
+             raise ValueError(
+                 f"Model '{self.model_name}' not found in all_outputs.json. "
+                 f"Available models: {available_models}"
+             )
+
+         # Get submissions for this model
+         model_submissions = all_outputs[self.model_name]
+
+         _LOG.info(f"Processing {len(model_submissions)} problems for model: {self.model_name}")
+
+         # Process submissions to create solution pairs
+         solution_map = self._process_submissions(model_submissions, problems)
+
+         _LOG.info(f"Loaded {len(solution_map)} LiveCodeBench problems with solutions")
+
+         self._solution_data = solution_map
+         self._problems_data = problems
+         return solution_map
+
+     def _process_submissions(
+         self,
+         model_submissions: list[dict[str, Any]],
+         problems: list[dict[str, Any]]
+     ) -> dict[str, Any]:
+         """
+         Process model submissions to create good/bad solution pairs.
+
+         Args:
+             model_submissions: List of submissions for a specific model
+                 Each has: code_list, pass1_list, metadata_list
+             problems: List of problem metadata from problems.json
+                 Each has: question_id, question_title, question_content, difficulty, etc.
+
+         Returns:
+             Dictionary with question_id -> {good_example, bad_example, difficulty}
+         """
+         import random
+
+         solution_map = {}
+
+         for problem_idx, submission in enumerate(model_submissions):
+             # Get problem metadata
+             if problem_idx >= len(problems):
+                 continue
+
+             problem_meta = problems[problem_idx]
+             question_id = problem_meta.get("question_id", str(problem_idx))
+
+             code_list = submission.get("code_list", [])
+             pass1_list = submission.get("pass1_list", [])
+
+             # Separate passing and failing submissions
+             passing_codes = []
+             failing_codes = []
+
+             for code, passed in zip(code_list, pass1_list):
+                 if passed:
+                     passing_codes.append(code)
+                 else:
+                     failing_codes.append(code)
+
+             # Skip if we don't have both passing and failing examples
+             if not passing_codes or not failing_codes:
+                 continue
+
+             # Randomly select one passing and one failing submission
+             good_code = random.choice(passing_codes)
+             bad_code = random.choice(failing_codes)
+
+             solution_map[question_id] = {
+                 "good_example": {"code": good_code, "passed": True},
+                 "bad_example": {"code": bad_code, "passed": False},
+                 "difficulty": problem_meta.get("difficulty", "unknown"),
+                 "problem_idx": problem_idx,
+             }
+
+         return solution_map
+
+     def extract_contrastive_pairs(
+         self,
+         lm_eval_task_data: ConfigurableTask,
+         limit: int | None = None,
+     ) -> list[ContrastivePair]:
+         """
+         Build contrastive pairs from LiveCodeBench using actual AI model solutions from HuggingFace Space.
+
+         Downloads submissions from livecodebench/code_generation_samples Space which contains:
+         - all_outputs.json: 22 models × 880 problems with code submissions and pass/fail results
+         - problems.json: Problem metadata (question_id, title, content, difficulty, etc.)
+
+         Available models include:
+             DeepSeek-V3, GPT-4O-2024-08-06, O1-2024-12-17, DeepSeek-R1-Preview,
+             GPT-4-Turbo, O1-Preview, O1-Mini, and 15 more
+
+         This method:
+         1. Downloads all_outputs.json and problems.json from HuggingFace Space
+         2. Extracts submissions for the specified model (default: GPT-4O-2024-08-06)
+         3. Processes solutions to identify passing (good) and failing (bad) submissions
+         4. Randomly selects one good and one bad example per problem
+         5. Matches solutions with lm-eval task docs by question_id
+         6. Creates ContrastivePair objects with actual code from the model
+
+         Args:
+             lm_eval_task_data: lm-eval task instance for LiveCodeBench.
+             limit: Optional maximum number of pairs to produce.
+
+         Returns:
+             A list of ContrastivePair objects.
+
+         Raises:
+             ValueError: If specified model not found in all_outputs.json
+             RuntimeError: If download from HuggingFace Space fails
+         """
+         log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "livecodebench"))
+
+         max_items = self._normalize_limit(limit)
+         docs = self.load_docs(lm_eval_task_data, max_items)
+
+         # Load pre-generated solutions
+         try:
+             solution_map = self._load_solution_data()
+         except FileNotFoundError as e:
+             log.error(f"Failed to load solution data: {e}")
+             return []
+
+         pairs: list[ContrastivePair] = []
+
+         log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+
+         for doc_idx, doc in enumerate(docs):
+             pair = self._extract_pair_from_doc(doc, solution_map, doc_idx=doc_idx)
+             if pair is not None:
+                 pairs.append(pair)
+             if max_items is not None and len(pairs) >= max_items:
+                 break
+
+         if not pairs:
+             task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+             log.warning(
+                 f"No valid LiveCodeBench pairs extracted from {len(docs)} docs. "
+                 f"Make sure model outputs are available and contain both passing and failing solutions. "
+                 f"Check that question_ids or problem indices match between lm-eval docs and model outputs.",
+                 extra={"task": task_name, "solution_count": len(solution_map)}
+             )
+
+         return pairs
+
+     def _extract_pair_from_doc(
+         self,
+         doc: dict[str, Any],
+         solution_map: dict[str, Any],
+         doc_idx: int = None
+     ) -> ContrastivePair | None:
+         """
+         Convert a single LiveCodeBench doc into a ContrastivePair using pre-generated solutions.
+
+         Args:
+             doc: Document from lm-eval task
+             solution_map: Mapping from question_id to good/bad examples
+             doc_idx: Document index (used as fallback if question_id lookup fails)
+
+         Returns:
+             ContrastivePair if solutions exist for this problem, None otherwise.
+         """
+         log = bind(_LOG, doc_id=doc.get("question_id", "unknown"))
+
+         try:
+             question_id = doc.get("question_id")
+             question_title = doc.get("question_title", "").strip()
+             question_content = doc.get("question_content", "").strip()
+             starter_code = doc.get("starter_code", "").strip()
+
+             if not question_id or not question_content:
+                 log.debug(
+                     "Skipping doc due to missing question_id or content",
+                     extra={"doc": doc},
+                 )
+                 return None
+
+             # Look up pre-generated solutions
+             # Try direct question_id lookup first
+             solutions = solution_map.get(question_id)
+
+             # If not found, try using doc_idx as fallback
+             if solutions is None and doc_idx is not None:
+                 # Try to find by problem_idx
+                 solutions = solution_map.get(str(doc_idx))
+
+             if solutions is None:
+                 log.debug(
+                     f"No pre-generated solutions found for question_id: {question_id} (doc_idx: {doc_idx})",
+                     extra={"question_id": question_id, "doc_idx": doc_idx},
+                 )
+                 return None
+
+             good_code = solutions["good_example"]["code"]
+             bad_code = solutions["bad_example"]["code"]
+
+             # Build prompt from problem description
+             prompt_parts = []
+             if question_title:
+                 prompt_parts.append(f"Problem: {question_title}")
+             prompt_parts.append(question_content)
+             if starter_code:
+                 prompt_parts.append(f"\nStarter Code:\n{starter_code}")
+
+             prompt = "\n\n".join(prompt_parts)
+
+             metadata = {
+                 "label": "livecodebench",
+                 "question_id": question_id,
+                 "difficulty": solutions.get("difficulty", "unknown"),
+                 "model": self.model_name,
+                 "problem_idx": solutions.get("problem_idx"),
+             }
+
+             return self._build_pair(
+                 prompt=prompt,
+                 correct_code=good_code,
+                 incorrect_code=bad_code,
+                 metadata=metadata,
+             )
+
+         except Exception as exc:
+             log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+             return None
+
+     @staticmethod
+     def _build_pair(
+         prompt: str,
+         correct_code: str,
+         incorrect_code: str,
+         metadata: dict[str, Any] | None = None,
+     ) -> ContrastivePair:
+         """
+         Build a ContrastivePair from a coding problem and correct/incorrect solutions.
+
+         Args:
+             prompt: The coding problem description
+             correct_code: Code that passes all tests (positive example)
+             incorrect_code: Code that fails tests (negative example)
+             metadata: Additional metadata about the problem
+
+         Returns:
+             ContrastivePair object
+         """
+         # Extract model name from metadata
+         model_name = metadata.get("model") if metadata else None
+
+         # Store model name in response labels for traceability
+         positive_response = PositiveResponse(
+             model_response=correct_code,
+             label=model_name
+         )
+         negative_response = NegativeResponse(
+             model_response=incorrect_code,
+             label=model_name
+         )
+
+         return ContrastivePair(
+             prompt=prompt,
+             positive_response=positive_response,
+             negative_response=negative_response,
+             label=metadata.get("label") if metadata else None,
+             trait_description=f"Model: {model_name}" if model_name else None,
+         )
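
The pairing step of this extractor boils down to _process_submissions: group a model's attempts per problem, keep only problems that have both a passing and a failing attempt, and record one of each keyed by question_id. A toy illustration of the input shapes it expects (field values are made up; only the keys come from this diff):

# Toy inputs for LiveCodeBenchExtractor._process_submissions: one model with
# two attempts on one problem, one passing and one failing. Running it would
# yield a single entry keyed by "lcb_0001" with a good_example/bad_example pair.
model_submissions = [
    {
        "code_list": ["def solve():\n    return 42", "def solve():\n    return 0"],
        "pass1_list": [True, False],
    },
]
problems = [
    {"question_id": "lcb_0001", "question_title": "Toy problem", "difficulty": "easy"},
]
# extractor = LiveCodeBenchExtractor(model_name="GPT-4O-2024-08-06")
# solution_map = extractor._process_submissions(model_submissions, problems)
# -> {"lcb_0001": {"good_example": {...}, "bad_example": {...},
#                  "difficulty": "easy", "problem_idx": 0}}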