wisent-0.5.12-py3-none-any.whl → wisent-0.5.14-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of wisent might be problematic.

Files changed (227)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +26 -0
  3. wisent/core/activations/activations.py +96 -0
  4. wisent/core/activations/activations_collector.py +71 -20
  5. wisent/core/activations/prompt_construction_strategy.py +47 -0
  6. wisent/core/agent/__init__.py +1 -18
  7. wisent/core/agent/budget.py +2 -2
  8. wisent/core/agent/device_benchmarks.py +1 -1
  9. wisent/core/agent/diagnose/__init__.py +1 -55
  10. wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
  11. wisent/core/agent/diagnose/response_diagnostics.py +4 -4
  12. wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
  13. wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
  14. wisent/core/agent/diagnose.py +2 -1
  15. wisent/core/autonomous_agent.py +10 -2
  16. wisent/core/benchmark_extractors.py +293 -0
  17. wisent/core/bigcode_integration.py +20 -7
  18. wisent/core/branding.py +108 -0
  19. wisent/core/cli/__init__.py +15 -0
  20. wisent/core/cli/create_steering_vector.py +138 -0
  21. wisent/core/cli/evaluate_responses.py +715 -0
  22. wisent/core/cli/generate_pairs.py +128 -0
  23. wisent/core/cli/generate_pairs_from_task.py +119 -0
  24. wisent/core/cli/generate_responses.py +129 -0
  25. wisent/core/cli/generate_vector_from_synthetic.py +149 -0
  26. wisent/core/cli/generate_vector_from_task.py +147 -0
  27. wisent/core/cli/get_activations.py +191 -0
  28. wisent/core/cli/optimize_classification.py +339 -0
  29. wisent/core/cli/optimize_steering.py +364 -0
  30. wisent/core/cli/tasks.py +182 -0
  31. wisent/core/cli_logger.py +22 -0
  32. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
  33. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
  34. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
  35. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
  36. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
  37. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  43. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
  44. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  45. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
  46. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
  47. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  48. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  49. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
  50. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
  51. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  52. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
  84. wisent/core/data_loaders/__init__.py +235 -0
  85. wisent/core/data_loaders/loaders/lm_loader.py +2 -2
  86. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  87. wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
  88. wisent/core/download_full_benchmarks.py +79 -2
  89. wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
  90. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
  91. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
  92. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
  93. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
  94. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  95. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  96. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
  97. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
  98. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
  99. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  100. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
  101. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
  102. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
  103. wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
  104. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
  105. wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
  106. wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
  107. wisent/core/lm_eval_harness_ground_truth.py +3 -2
  108. wisent/core/main.py +57 -0
  109. wisent/core/model_persistence.py +2 -2
  110. wisent/core/models/wisent_model.py +6 -6
  111. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  112. wisent/core/optuna/steering/steering_optimization.py +1 -1
  113. wisent/core/parser_arguments/__init__.py +10 -0
  114. wisent/core/parser_arguments/agent_parser.py +110 -0
  115. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  116. wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
  117. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  118. wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
  119. wisent/core/parser_arguments/full_optimize_parser.py +115 -0
  120. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  121. wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
  122. wisent/core/parser_arguments/generate_responses_parser.py +15 -0
  123. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
  124. wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
  125. wisent/core/parser_arguments/generate_vector_parser.py +90 -0
  126. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  127. wisent/core/parser_arguments/main_parser.py +152 -0
  128. wisent/core/parser_arguments/model_config_parser.py +59 -0
  129. wisent/core/parser_arguments/monitor_parser.py +17 -0
  130. wisent/core/parser_arguments/multi_steer_parser.py +47 -0
  131. wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
  132. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  133. wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
  134. wisent/core/parser_arguments/synthetic_parser.py +93 -0
  135. wisent/core/parser_arguments/tasks_parser.py +584 -0
  136. wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
  137. wisent/core/parser_arguments/utils.py +111 -0
  138. wisent/core/prompts/core/prompt_formater.py +3 -3
  139. wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
  140. wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
  141. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
  142. wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
  143. wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
  144. wisent/core/steering_optimizer.py +45 -21
  145. wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
  146. wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
  147. wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
  148. wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
  149. wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
  150. wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
  151. wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
  152. wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
  153. wisent/core/tasks/livecodebench_task.py +4 -103
  154. wisent/core/timing_calibration.py +1 -1
  155. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/METADATA +3 -3
  156. wisent-0.5.14.dist-info/RECORD +294 -0
  157. wisent-0.5.14.dist-info/entry_points.txt +2 -0
  158. wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
  159. wisent/classifiers/core/atoms.py +0 -747
  160. wisent/classifiers/models/logistic.py +0 -29
  161. wisent/classifiers/models/mlp.py +0 -47
  162. wisent/cli/classifiers/classifier_rotator.py +0 -137
  163. wisent/cli/cli_logger.py +0 -142
  164. wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
  165. wisent/cli/wisent_cli/commands/listing.py +0 -154
  166. wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
  167. wisent/cli/wisent_cli/main.py +0 -93
  168. wisent/cli/wisent_cli/shell.py +0 -80
  169. wisent/cli/wisent_cli/ui.py +0 -69
  170. wisent/cli/wisent_cli/util/aggregations.py +0 -43
  171. wisent/cli/wisent_cli/util/parsing.py +0 -126
  172. wisent/cli/wisent_cli/version.py +0 -4
  173. wisent/opti/methods/__init__.py +0 -0
  174. wisent/synthetic/__init__.py +0 -0
  175. wisent/synthetic/cleaners/__init__.py +0 -0
  176. wisent/synthetic/cleaners/core/__init__.py +0 -0
  177. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  178. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  179. wisent/synthetic/db_instructions/__init__.py +0 -0
  180. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  181. wisent/synthetic/generators/__init__.py +0 -0
  182. wisent/synthetic/generators/core/__init__.py +0 -0
  183. wisent/synthetic/generators/diversities/__init__.py +0 -0
  184. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  185. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  186. wisent-0.5.12.dist-info/RECORD +0 -220
  187. /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
  188. /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
  189. /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
  190. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
  191. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
  192. /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
  193. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
  194. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
  195. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
  196. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
  197. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
  198. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
  199. /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
  200. /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
  201. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
  202. /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
  203. /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
  204. /wisent/{opti → core/opti}/core/atoms.py +0 -0
  205. /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
  206. /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
  207. /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
  208. /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
  209. /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
  210. /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
  211. /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
  212. /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
  213. /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
  214. /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
  215. /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
  216. /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
  217. /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
  218. /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
  219. /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
  220. /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
  221. /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
  222. /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
  223. /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
  224. /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
  225. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/WHEEL +0 -0
  226. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/licenses/LICENSE +0 -0
  227. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/top_level.txt +0 -0
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+from typing import Any, TYPE_CHECKING
+
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+from wisent.core.cli_logger import setup_logger, bind
+
+if TYPE_CHECKING:
+    from lm_eval.api.task import ConfigurableTask
+
+
+__all__ = ["MutualExtractor"]
+_LOG = setup_logger(__name__)
+
+
+class MutualExtractor(LMEvalBenchmarkExtractor):
+    """Extractor for the Mutual benchmark."""
+
+    def extract_contrastive_pairs(
+        self,
+        lm_eval_task_data: ConfigurableTask,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from Mutual docs.
+
+        Mutual schema:
+            - article: str
+            - options: list[str]
+            - answers: str
+
+        Args:
+            lm_eval_task_data: lm-eval task instance for Mutual.
+            limit: Optional maximum number of pairs to produce.
+
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+
+        max_items = self._normalize_limit(limit)
+        docs = self.load_docs(lm_eval_task_data, max_items)
+
+        pairs: list[ContrastivePair] = []
+
+        log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+
+        for doc in docs:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+            if max_items is not None and len(pairs) >= max_items:
+                break
+
+        if not pairs:
+            task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+            log.warning("No valid Mutual pairs extracted", extra={"task": task_name})
+
+        return pairs
+
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single Mutual doc into a ContrastivePair, if possible.
+        Returns None when required fields are missing or malformed.
+        """
+        log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+
+        try:
+            article = str(doc.get("article", "")).strip()
+            options = doc.get("options", [])
+            answer = str(doc.get("answers", "")).strip()
+            answer_idx = int(ord(answer) - ord("A"))
+
+
+            if not article or not options or not (0 <= answer_idx < len(options)):
+                log.debug(
+                    "Skipping doc due to missing/invalid fields",
+                    extra={"doc": doc},
+                )
+                return None
+
+            correct = options[answer_idx]
+            incorrect = options[(answer_idx + 1) % len(options)]
+
+            formatted_question = f"{article}\nA. {incorrect}\nB. {correct}"
+
+            metadata = {
+                "label": "mutual",
+            }
+
+            return self._build_pair(
+                question=formatted_question,
+                correct=correct,
+                incorrect=incorrect,
+                metadata=metadata,
+            )
+
+        except Exception as exc:
+            log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+            return None
+
+    @staticmethod
+    def _build_pair(
+        question: str,
+        correct: str,
+        incorrect: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> ContrastivePair:
+        positive_response = PositiveResponse(model_response=correct)
+        negative_response = NegativeResponse(model_response=incorrect)
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+from typing import Any, TYPE_CHECKING
+
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+from wisent.core.cli_logger import setup_logger, bind
+
+if TYPE_CHECKING:
+    from lm_eval.api.task import ConfigurableTask
+
+
+__all__ = ["OpenBookQAExtractor"]
+_LOG = setup_logger(__name__)
+
+
+class OpenBookQAExtractor(LMEvalBenchmarkExtractor):
+    """Extractor for the OpenBookQA benchmark."""
+
+    def extract_contrastive_pairs(
+        self,
+        lm_eval_task_data: ConfigurableTask,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from OpenBookQA docs.
+
+        OpenBookQA schema:
+            - question_stem: str
+            - choices: dict
+            - choices["text"]: list of possible ending strings
+            - answerKey: str
+
+        Args:
+            lm_eval_task_data: lm-eval task instance for OpenBookQA.
+            limit: Optional maximum number of pairs to produce.
+
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+
+        max_items = self._normalize_limit(limit)
+        docs = self.load_docs(lm_eval_task_data, max_items)
+
+        pairs: list[ContrastivePair] = []
+
+        log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+
+        for doc in docs:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+            if max_items is not None and len(pairs) >= max_items:
+                break
+
+        if not pairs:
+            task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+            log.warning("No valid OpenBookQA pairs extracted", extra={"task": task_name})
+
+        return pairs
+
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single OpenBookQA doc into a ContrastivePair, if possible.
+        Returns None when required fields are missing or malformed.
+        """
+        log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+
+        try:
+            question_stem = str(doc.get("question_stem", "")).strip()
+            choices = doc.get("choices", {})
+            endings = choices["text"]
+            answer = str(doc.get("answerKey", "")).strip()
+            answer_idx = int(ord(answer) - ord("A"))
+
+            if not question_stem or not endings or not (0 <= answer_idx < len(endings)):
+                log.debug(
+                    "Skipping doc due to missing/invalid fields",
+                    extra={"doc": doc},
+                )
+                return None
+
+            correct = endings[answer_idx]
+            incorrect = endings[(answer_idx + 1) % len(endings)]
+
+            question = f"{question_stem}"
+            formatted_question = f"{question}\nA. {incorrect}\nB. {correct}"
+
+            metadata = {
+                "label": "openbookqa",
+            }
+
+            return self._build_pair(
+                question=formatted_question,
+                correct=correct,
+                incorrect=incorrect,
+                metadata=metadata,
+            )
+
+        except Exception as exc:
+            log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+            return None
+
+    @staticmethod
+    def _build_pair(
+        question: str,
+        correct: str,
+        incorrect: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> ContrastivePair:
+        positive_response = PositiveResponse(model_response=correct)
+        negative_response = NegativeResponse(model_response=incorrect)
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
+ return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
@@ -0,0 +1,111 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, TYPE_CHECKING
4
+
5
+ from wisent.core.contrastive_pairs.core.pair import ContrastivePair
6
+ from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
7
+ from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
8
+ from wisent.core.cli_logger import setup_logger, bind
9
+
10
+ if TYPE_CHECKING:
11
+ from lm_eval.api.task import ConfigurableTask
12
+
13
+
14
+ __all__ = ["PAWSXExtractor"]
15
+ _LOG = setup_logger(__name__)
16
+
17
+
18
+ class PAWSXExtractor(LMEvalBenchmarkExtractor):
19
+ """Extractor for the PAWS-X benchmark."""
20
+
21
+ def extract_contrastive_pairs(
22
+ self,
23
+ lm_eval_task_data: ConfigurableTask,
24
+ limit: int | None = None,
25
+ ) -> list[ContrastivePair]:
26
+ """
27
+ Build contrastive pairs from PAWS-X docs.
28
+
29
+ PAWS-X schema:
30
+ - sentence1: str
31
+ - sentence2: str
32
+ - label: 0 or 1
33
+
34
+ Args:
35
+ lm_eval_task_data: lm-eval task instance for PAWS-X.
36
+ limit: Optional maximum number of pairs to produce.
37
+
38
+ Returns:
39
+ A list of ContrastivePair objects.
40
+ """
41
+ log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
42
+
43
+ max_items = self._normalize_limit(limit)
44
+ docs = self.load_docs(lm_eval_task_data, max_items)
45
+
46
+ pairs: list[ContrastivePair] = []
47
+
48
+ log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
49
+
50
+ for doc in docs:
51
+ pair = self._extract_pair_from_doc(doc)
52
+ if pair is not None:
53
+ pairs.append(pair)
54
+ if max_items is not None and len(pairs) >= max_items:
55
+ break
56
+
57
+ if not pairs:
58
+ task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
59
+ log.warning("No valid PAWS-X pairs extracted", extra={"task": task_name})
60
+
61
+ return pairs
62
+
63
+ def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
64
+ """
65
+ Convert a single PAWS-X doc into a ContrastivePair, if possible.
66
+ Returns None when required fields are missing or malformed.
67
+ """
68
+ log = bind(_LOG, doc_id=doc.get("id", "unknown"))
69
+
70
+ try:
71
+ sentence1 = str(doc.get("sentence1", "")).strip()
72
+ sentence2 = str(doc.get("sentence2", "")).strip()
73
+ label = doc.get("label")
74
+
75
+ if not sentence1 or not sentence2 or label not in {0, 1}:
76
+ log.debug(
77
+ "Skipping doc due to missing/invalid fields",
78
+ extra={"doc": doc},
79
+ )
80
+ return None
81
+
82
+ formatted_question = f"Is sentence '{sentence1}' paraphrase of sentence '{sentence2}'?\nA. Yes\nB. No"
83
+
84
+ correct = "Yes" if label == 0 else "No"
85
+ incorrect = "No" if label == 0 else "Yes"
86
+
87
+ metadata = {
88
+ "label": "paws-x",
89
+ }
90
+
91
+ return self._build_pair(
92
+ question=formatted_question,
93
+ correct=correct,
94
+ incorrect=incorrect,
95
+ metadata=metadata,
96
+ )
97
+
98
+ except Exception as exc:
99
+ log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
100
+ return None
101
+
102
+ @staticmethod
103
+ def _build_pair(
104
+ question: str,
105
+ correct: str,
106
+ incorrect: str,
107
+ metadata: dict[str, Any] | None = None,
108
+ ) -> ContrastivePair:
109
+ positive_response = PositiveResponse(model_response=correct)
110
+ negative_response = NegativeResponse(model_response=incorrect)
111
+ return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+from typing import Any, TYPE_CHECKING
+
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+from wisent.core.cli_logger import setup_logger, bind
+
+if TYPE_CHECKING:
+    from lm_eval.api.task import ConfigurableTask
+
+
+__all__ = ["PIQAExtractor"]
+_LOG = setup_logger(__name__)
+
+
+class PIQAExtractor(LMEvalBenchmarkExtractor):
+    """Extractor for the PIQA benchmark."""
+
+    def extract_contrastive_pairs(
+        self,
+        lm_eval_task_data: ConfigurableTask,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from PIQA docs.
+
+        PIQA schema:
+            - goal: str
+            - sol1, sol2: str
+            - label: 0 or 1
+
+        Args:
+            lm_eval_task_data: lm-eval task instance for PIQA.
+            limit: Optional maximum number of pairs to produce.
+
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+
+        max_items = self._normalize_limit(limit)
+        docs = self.load_docs(lm_eval_task_data, max_items)
+
+        pairs: list[ContrastivePair] = []
+
+        log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+
+        for doc in docs:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+            if max_items is not None and len(pairs) >= max_items:
+                break
+
+        if not pairs:
+            task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+            log.warning("No valid PIQA pairs extracted", extra={"task": task_name})
+
+        return pairs
+
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single PIQA doc into a ContrastivePair, if possible.
+        Returns None when required fields are missing or malformed.
+        """
+        log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+
+        try:
+            goal = str(doc.get("goal", "")).strip()
+            sol1 = str(doc.get("sol1", "")).strip()
+            sol2 = str(doc.get("sol2", "")).strip()
+            label = doc.get("label")
+
+            if not goal or not sol1 or not sol2 or label not in {0, 1}:
+                log.debug(
+                    "Skipping doc due to missing/invalid fields",
+                    extra={"doc": doc},
+                )
+                return None
+
+            question = f"Question: {goal}\nAnswer:"
+            formatted_question = f"{question}\nA. {sol1}\nB. {sol2}"
+
+            correct = sol1 if label == 0 else sol2
+            incorrect = sol2 if label == 0 else sol1
+
+            metadata = {
+                "label": "piqa",
+            }
+
+            return self._build_pair(
+                question=formatted_question,
+                correct=correct,
+                incorrect=incorrect,
+                metadata=metadata,
+            )
+
+        except Exception as exc:
+            log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+            return None
+
+    @staticmethod
+    def _build_pair(
+        question: str,
+        correct: str,
+        incorrect: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> ContrastivePair:
+        positive_response = PositiveResponse(model_response=correct)
+        negative_response = NegativeResponse(model_response=incorrect)
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+from typing import Any, TYPE_CHECKING
+
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+from wisent.core.cli_logger import setup_logger, bind
+
+if TYPE_CHECKING:
+    from lm_eval.api.task import ConfigurableTask
+
+
+__all__ = ["PROSTExtractor"]
+_LOG = setup_logger(__name__)
+
+
+class PROSTExtractor(LMEvalBenchmarkExtractor):
+    """Extractor for the PROST benchmark."""
+
+    def extract_contrastive_pairs(
+        self,
+        lm_eval_task_data: ConfigurableTask,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from PROST docs.
+
+        PROST schema:
+            - context: str
+            - ex_question: str
+            - A, B, C, D: possible answers, str
+            - label: 0, 1, 2, or 3
+
+        Args:
+            lm_eval_task_data: lm-eval task instance for PROST.
+            limit: Optional maximum number of pairs to produce.
+
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+
+        max_items = self._normalize_limit(limit)
+        docs = self.load_docs(lm_eval_task_data, max_items)
+
+        pairs: list[ContrastivePair] = []
+
+        log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+
+        for doc in docs:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+            if max_items is not None and len(pairs) >= max_items:
+                break
+
+        if not pairs:
+            task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+            log.warning("No valid PROST pairs extracted", extra={"task": task_name})
+
+        return pairs
+
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single PROST doc into a ContrastivePair, if possible.
+        Returns None when required fields are missing or malformed.
+        """
+        log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+
+        try:
+            context = str(doc.get("context", "")).strip()
+            question = str(doc.get("ex_question", "")).strip()
+            answers = [str(doc.get("A", "")).strip(), str(doc.get("B", "")).strip(), str(doc.get("C", "")).strip(), str(doc.get("D", "")).strip()]
+            label = doc.get("label")
+
+            if not context or not question or not answers or label not in {0, 1, 2, 3}:
+                log.debug(
+                    "Skipping doc due to missing/invalid fields",
+                    extra={"doc": doc},
+                )
+                return None
+
+            correct = answers[label]
+            incorrect = answers[(label + 1) % len(answers)]
+
+            formatted_question = f"{context}\nQuestion: {question}\nAnswer:\nA. {incorrect}\nB. {correct}"
+
+            metadata = {
+                "label": "prost",
+            }
+
+            return self._build_pair(
+                question=formatted_question,
+                correct=correct,
+                incorrect=incorrect,
+                metadata=metadata,
+            )
+
+        except Exception as exc:
+            log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+            return None
+
+    @staticmethod
+    def _build_pair(
+        question: str,
+        correct: str,
+        incorrect: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> ContrastivePair:
+        positive_response = PositiveResponse(model_response=correct)
+        negative_response = NegativeResponse(model_response=incorrect)
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py
@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+from typing import Any, TYPE_CHECKING
+
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+from wisent.core.cli_logger import setup_logger, bind
+
+if TYPE_CHECKING:
+    from lm_eval.api.task import ConfigurableTask
+
+
+__all__ = ["PubMedQAExtractor"]
+_LOG = setup_logger(__name__)
+
+
+class PubMedQAExtractor(LMEvalBenchmarkExtractor):
+    """Extractor for the PubMedQA benchmark."""
+
+    def extract_contrastive_pairs(
+        self,
+        lm_eval_task_data: ConfigurableTask,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from PubMedQA docs.
+
+        PubMedQA schema:
+            - CONTEXTS: list of strings to concatenate
+            - QUESTION: str
+            - final_decision: "yes" or "no"
+
+        Args:
+            lm_eval_task_data: lm-eval task instance for PubMedQA.
+            limit: Optional maximum number of pairs to produce.
+
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+
+        max_items = self._normalize_limit(limit)
+        docs = self.load_docs(lm_eval_task_data, max_items)
+
+        pairs: list[ContrastivePair] = []
+
+        log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+
+        for doc in docs:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+            if max_items is not None and len(pairs) >= max_items:
+                break
+
+        if not pairs:
+            task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+            log.warning("No valid PubMedQA pairs extracted", extra={"task": task_name})
+
+        return pairs
+
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single PubMedQA doc into a ContrastivePair, if possible.
+        Returns None when required fields are missing or malformed.
+        """
+        log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+
+        try:
+            contexts = doc.get("CONTEXTS", [])
+            question = str(doc.get("QUESTION", "")).strip()
+            final_decision = str(doc.get("final_decision", "")).strip()
+
+            if not contexts or not question or not final_decision:
+                log.debug(
+                    "Skipping doc due to missing/invalid fields",
+                    extra={"doc": doc},
+                )
+                return None
+
+            formatted_context = " ".join(s.strip() for s in contexts if isinstance(s, str) and s.strip())
+            formatted_question = f"Abstract: {formatted_context}\nQuestion: {question}\nAnswer:\nA. yes\nB. no"
+
+            correct = final_decision
+            incorrect = "yes" if correct == "no" else "no"
+
+            metadata = {
+                "label": "pubmedqa",
+            }
+
+            return self._build_pair(
+                question=formatted_question,
+                correct=correct,
+                incorrect=incorrect,
+                metadata=metadata,
+            )
+
+        except Exception as exc:
+            log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+            return None
+
+    @staticmethod
+    def _build_pair(
+        question: str,
+        correct: str,
+        incorrect: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> ContrastivePair:
+        positive_response = PositiveResponse(model_response=correct)
+        negative_response = NegativeResponse(model_response=incorrect)
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
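The new extractors above all follow the same pattern: load docs from an lm-eval ConfigurableTask, pick the correct option plus one incorrect option per doc, and wrap them in ContrastivePair objects. The sketch below shows how one of them might be driven end to end. It is illustrative only and relies on assumptions not shown in this diff: that lm-evaluation-harness is installed, that "mutual" resolves through lm_eval.tasks.get_task_dict, and that MutualExtractor can be constructed without arguments.

# Hedged usage sketch (not part of the diff above).
# Assumptions: lm-evaluation-harness is installed, "mutual" is a registered task
# name, and MutualExtractor takes no constructor arguments.
from lm_eval.tasks import get_task_dict

from wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_extractors.mutual import MutualExtractor

task = get_task_dict(["mutual"])["mutual"]  # lm-eval ConfigurableTask instance
pairs = MutualExtractor().extract_contrastive_pairs(task, limit=5)

for pair in pairs:
    # Each pair holds the formatted prompt plus a positive and a negative response.
    print(pair.prompt)
    print("  correct:  ", pair.positive_response.model_response)
    print("  incorrect:", pair.negative_response.model_response)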