wisent 0.5.11__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (225) hide show
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +26 -0
  3. wisent/core/activations/activations.py +96 -0
  4. wisent/core/activations/activations_collector.py +71 -20
  5. wisent/core/activations/prompt_construction_strategy.py +47 -0
  6. wisent/core/agent/budget.py +2 -2
  7. wisent/core/agent/device_benchmarks.py +1 -1
  8. wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
  9. wisent/core/agent/diagnose/response_diagnostics.py +4 -4
  10. wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
  11. wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
  12. wisent/core/agent/diagnose.py +2 -1
  13. wisent/core/autonomous_agent.py +10 -2
  14. wisent/core/benchmark_extractors.py +293 -0
  15. wisent/core/bigcode_integration.py +20 -7
  16. wisent/core/branding.py +108 -0
  17. wisent/core/cli/__init__.py +15 -0
  18. wisent/core/cli/create_steering_vector.py +138 -0
  19. wisent/core/cli/evaluate_responses.py +715 -0
  20. wisent/core/cli/generate_pairs.py +128 -0
  21. wisent/core/cli/generate_pairs_from_task.py +119 -0
  22. wisent/core/cli/generate_responses.py +129 -0
  23. wisent/core/cli/generate_vector_from_synthetic.py +149 -0
  24. wisent/core/cli/generate_vector_from_task.py +147 -0
  25. wisent/core/cli/get_activations.py +191 -0
  26. wisent/core/cli/optimize_classification.py +339 -0
  27. wisent/core/cli/optimize_steering.py +364 -0
  28. wisent/core/cli/tasks.py +182 -0
  29. wisent/core/cli_logger.py +22 -0
  30. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
  31. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
  32. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
  33. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
  34. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
  35. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
  36. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
  37. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  43. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
  44. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
  45. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  46. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  47. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
  48. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
  49. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  50. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  51. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  52. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
  82. wisent/core/data_loaders/__init__.py +235 -0
  83. wisent/core/data_loaders/loaders/lm_loader.py +2 -2
  84. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  85. wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
  86. wisent/core/download_full_benchmarks.py +79 -2
  87. wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
  88. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
  89. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
  90. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
  91. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
  92. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  93. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  94. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
  95. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
  96. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
  97. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  98. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
  99. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
  100. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
  101. wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
  102. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
  103. wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
  104. wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
  105. wisent/core/lm_eval_harness_ground_truth.py +3 -2
  106. wisent/core/main.py +57 -0
  107. wisent/core/model_persistence.py +2 -2
  108. wisent/core/models/wisent_model.py +8 -6
  109. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  110. wisent/core/optuna/steering/steering_optimization.py +1 -1
  111. wisent/core/parser_arguments/__init__.py +10 -0
  112. wisent/core/parser_arguments/agent_parser.py +110 -0
  113. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  114. wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
  115. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  116. wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
  117. wisent/core/parser_arguments/full_optimize_parser.py +115 -0
  118. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  119. wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
  120. wisent/core/parser_arguments/generate_responses_parser.py +15 -0
  121. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
  122. wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
  123. wisent/core/parser_arguments/generate_vector_parser.py +90 -0
  124. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  125. wisent/core/parser_arguments/main_parser.py +152 -0
  126. wisent/core/parser_arguments/model_config_parser.py +59 -0
  127. wisent/core/parser_arguments/monitor_parser.py +17 -0
  128. wisent/core/parser_arguments/multi_steer_parser.py +47 -0
  129. wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
  130. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  131. wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
  132. wisent/core/parser_arguments/synthetic_parser.py +93 -0
  133. wisent/core/parser_arguments/tasks_parser.py +584 -0
  134. wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
  135. wisent/core/parser_arguments/utils.py +111 -0
  136. wisent/core/prompts/core/prompt_formater.py +3 -3
  137. wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
  138. wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
  139. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
  140. wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
  141. wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
  142. wisent/core/steering_optimizer.py +45 -21
  143. wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
  144. wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
  145. wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
  146. wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
  147. wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
  148. wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
  149. wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
  150. wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
  151. wisent/core/tasks/livecodebench_task.py +4 -103
  152. wisent/core/timing_calibration.py +1 -1
  153. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/METADATA +3 -3
  154. wisent-0.5.13.dist-info/RECORD +294 -0
  155. wisent-0.5.13.dist-info/entry_points.txt +2 -0
  156. wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
  157. wisent/classifiers/core/atoms.py +0 -747
  158. wisent/classifiers/models/logistic.py +0 -29
  159. wisent/classifiers/models/mlp.py +0 -47
  160. wisent/cli/classifiers/classifier_rotator.py +0 -137
  161. wisent/cli/cli_logger.py +0 -142
  162. wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
  163. wisent/cli/wisent_cli/commands/listing.py +0 -154
  164. wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
  165. wisent/cli/wisent_cli/main.py +0 -93
  166. wisent/cli/wisent_cli/shell.py +0 -80
  167. wisent/cli/wisent_cli/ui.py +0 -69
  168. wisent/cli/wisent_cli/util/aggregations.py +0 -43
  169. wisent/cli/wisent_cli/util/parsing.py +0 -126
  170. wisent/cli/wisent_cli/version.py +0 -4
  171. wisent/opti/methods/__init__.py +0 -0
  172. wisent/synthetic/__init__.py +0 -0
  173. wisent/synthetic/cleaners/__init__.py +0 -0
  174. wisent/synthetic/cleaners/core/__init__.py +0 -0
  175. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  176. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  177. wisent/synthetic/db_instructions/__init__.py +0 -0
  178. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  179. wisent/synthetic/generators/__init__.py +0 -0
  180. wisent/synthetic/generators/core/__init__.py +0 -0
  181. wisent/synthetic/generators/diversities/__init__.py +0 -0
  182. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  183. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  184. wisent-0.5.11.dist-info/RECORD +0 -220
  185. /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
  186. /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
  187. /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
  188. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
  189. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
  190. /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
  191. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
  192. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
  193. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
  194. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
  195. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
  196. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
  197. /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
  198. /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
  199. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
  200. /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
  201. /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
  202. /wisent/{opti → core/opti}/core/atoms.py +0 -0
  203. /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
  204. /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
  205. /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
  206. /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
  207. /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
  208. /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
  209. /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
  210. /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
  211. /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
  212. /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
  213. /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
  214. /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
  215. /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
  216. /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
  217. /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
  218. /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
  219. /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
  220. /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
  221. /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
  222. /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
  223. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/WHEEL +0 -0
  224. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/licenses/LICENSE +0 -0
  225. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,116 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ from typing import Any, TYPE_CHECKING
5
+
6
+ from wisent.core.contrastive_pairs.core.pair import ContrastivePair
7
+ from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
8
+ from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
9
+ from wisent.core.cli_logger import setup_logger, bind
10
+
11
+ if TYPE_CHECKING:
12
+ from lm_eval.api.task import ConfigurableTask
13
+
14
+
15
+ __all__ = ["QA4MREExtractor"]
16
+ _LOG = setup_logger(__name__)
17
+
18
+
19
+ class QA4MREExtractor(LMEvalBenchmarkExtractor):
20
+ """Extractor for the QA4MRE benchmark."""
21
+
22
+ def extract_contrastive_pairs(
23
+ self,
24
+ lm_eval_task_data: ConfigurableTask,
25
+ limit: int | None = None,
26
+ ) -> list[ContrastivePair]:
27
+ """
28
+ Build contrastive pairs from QA4MRE docs.
29
+
30
+ Race schema:
31
+ - document_str: str
32
+ - question_str: str
33
+ - answer_options: dict
34
+ - correct_answer_id: str
35
+
36
+ Args:
37
+ lm_eval_task_data: lm-eval task instance for QA4MRE.
38
+ limit: Optional maximum number of pairs to produce.
39
+
40
+ Returns:
41
+ A list of ContrastivePair objects.
42
+ """
43
+ log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
44
+
45
+ max_items = self._normalize_limit(limit)
46
+ docs = self.load_docs(lm_eval_task_data, max_items)
47
+
48
+ pairs: list[ContrastivePair] = []
49
+
50
+ log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
51
+
52
+ for doc in docs:
53
+ pair = self._extract_pair_from_doc(doc)
54
+ if pair is not None:
55
+ pairs.append(pair)
56
+ if max_items is not None and len(pairs) >= max_items:
57
+ break
58
+
59
+ if not pairs:
60
+ task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
61
+ log.warning("No valid QA4MRE pairs extracted", extra={"task": task_name})
62
+
63
+ return pairs
64
+
65
+ def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
66
+ """
67
+ Convert a single QA4MRE doc into a ContrastivePair, if possible.
68
+ Returns None when required fields are missing or malformed.
69
+ """
70
+ log = bind(_LOG, doc_id=doc.get("id", "unknown"))
71
+
72
+ try:
73
+ document_str = str(doc.get("document_str", "")).strip()
74
+ question_str = str(doc.get("question_str")).strip()
75
+ answers = doc.get("answer_options", {}).get("answer_str", [])
76
+ answer = str(doc.get("correct_answer_id", "")).strip()
77
+ answer = int(answer) - 1
78
+
79
+
80
+ if not document_str or not question_str or not answers or not (0 <= answer < len(answers)):
81
+ log.debug(
82
+ "Skipping doc due to missing/invalid fields",
83
+ extra={"doc": doc},
84
+ )
85
+ return None
86
+
87
+ correct = answers[answer]
88
+ incorrect = answers[(answer+1)%len(answers)]
89
+
90
+ formatted_question = f"{document_str}\nQuestion: {question_str}?\nAnswer:\nA. {correct}\nB. {incorrect}"
91
+
92
+ metadata = {
93
+ "label": "qa4mre",
94
+ }
95
+
96
+ return self._build_pair(
97
+ question=formatted_question,
98
+ correct=correct,
99
+ incorrect=incorrect,
100
+ metadata=metadata,
101
+ )
102
+
103
+ except Exception as exc:
104
+ log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
105
+ return None
106
+
107
+ @staticmethod
108
+ def _build_pair(
109
+ question: str,
110
+ correct: str,
111
+ incorrect: str,
112
+ metadata: dict[str, Any] | None = None,
113
+ ) -> ContrastivePair:
114
+ positive_response = PositiveResponse(model_response=correct)
115
+ negative_response = NegativeResponse(model_response=incorrect)
116
+ return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ from typing import Any, TYPE_CHECKING
5
+
6
+ from wisent.core.contrastive_pairs.core.pair import ContrastivePair
7
+ from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
8
+ from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
9
+ from wisent.core.cli_logger import setup_logger, bind
10
+
11
+ if TYPE_CHECKING:
12
+ from lm_eval.api.task import ConfigurableTask
13
+
14
+
15
+ __all__ = ["QASPERExtractor"]
16
+ _LOG = setup_logger(__name__)
17
+
18
+
19
+ class QASPERExtractor(LMEvalBenchmarkExtractor):
20
+ """Extractor for the QASPER benchmark."""
21
+
22
+ def extract_contrastive_pairs(
23
+ self,
24
+ lm_eval_task_data: ConfigurableTask,
25
+ limit: int | None = None,
26
+ ) -> list[ContrastivePair]:
27
+ """
28
+ Build contrastive pairs from QASPER docs.
29
+
30
+ Race schema:
31
+ - title: str
32
+ - abstract: str
33
+ - question: str
34
+ - answer: str
35
+
36
+ Args:
37
+ lm_eval_task_data: lm-eval task instance for QASPER.
38
+ limit: Optional maximum number of pairs to produce.
39
+
40
+ Returns:
41
+ A list of ContrastivePair objects.
42
+ """
43
+ log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
44
+
45
+ max_items = self._normalize_limit(limit)
46
+ docs = self.load_docs(lm_eval_task_data, max_items)
47
+
48
+ pairs: list[ContrastivePair] = []
49
+
50
+ log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
51
+
52
+ for doc in docs:
53
+ pair = self._extract_pair_from_doc(doc)
54
+ if pair is not None:
55
+ pairs.append(pair)
56
+ if max_items is not None and len(pairs) >= max_items:
57
+ break
58
+
59
+ if not pairs:
60
+ task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
61
+ log.warning("No valid QA4MRE pairs extracted", extra={"task": task_name})
62
+
63
+ return pairs
64
+
65
+ def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
66
+ """
67
+ Convert a single QASPER doc into a ContrastivePair, if possible.
68
+ Returns None when required fields are missing or malformed.
69
+ """
70
+ log = bind(_LOG, doc_id=doc.get("id", "unknown"))
71
+
72
+ try:
73
+ title = str(doc.get("title", "")).strip()
74
+ abstract = str(doc.get("abstract", "")).strip()
75
+ question = str(doc.get("question", "")).strip()
76
+ answer = str(doc.get("answer", "")).strip()
77
+
78
+ if not title or not abstract or not question or not answer:
79
+ log.debug(
80
+ "Skipping doc due to missing/invalid fields",
81
+ extra={"doc": doc},
82
+ )
83
+ return None
84
+
85
+
86
+ formatted_question = f"TITLE: {title}\nABSTRACT: {abstract}\nQ: {question}\nA. yes\nB. no"
87
+
88
+ correct = answer
89
+ incorrect = "yes" if answer == "no" else "no"
90
+
91
+ metadata = {
92
+ "label": "qasper",
93
+ }
94
+
95
+ return self._build_pair(
96
+ question=formatted_question,
97
+ correct=correct,
98
+ incorrect=incorrect,
99
+ metadata=metadata,
100
+ )
101
+
102
+ except Exception as exc:
103
+ log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
104
+ return None
105
+
106
+ @staticmethod
107
+ def _build_pair(
108
+ question: str,
109
+ correct: str,
110
+ incorrect: str,
111
+ metadata: dict[str, Any] | None = None,
112
+ ) -> ContrastivePair:
113
+ positive_response = PositiveResponse(model_response=correct)
114
+ negative_response = NegativeResponse(model_response=incorrect)
115
+ return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
@@ -0,0 +1,111 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, TYPE_CHECKING
4
+
5
+ from wisent.core.contrastive_pairs.core.pair import ContrastivePair
6
+ from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
7
+ from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
8
+ from wisent.core.cli_logger import setup_logger, bind
9
+
10
+ if TYPE_CHECKING:
11
+ from lm_eval.api.task import ConfigurableTask
12
+
13
+
14
+ __all__ = ["QNLIExtractor"]
15
+ _LOG = setup_logger(__name__)
16
+
17
+
18
+ class QNLIExtractor(LMEvalBenchmarkExtractor):
19
+ """Extractor for the QNLI benchmark."""
20
+
21
+ def extract_contrastive_pairs(
22
+ self,
23
+ lm_eval_task_data: ConfigurableTask,
24
+ limit: int | None = None,
25
+ ) -> list[ContrastivePair]:
26
+ """
27
+ Build contrastive pairs from QNLI docs.
28
+
29
+ QNLI schema:
30
+ - sentence: str
31
+ - question: str
32
+ - label: 0 or 1
33
+
34
+ Args:
35
+ lm_eval_task_data: lm-eval task instance for QNLI.
36
+ limit: Optional maximum number of pairs to produce.
37
+
38
+ Returns:
39
+ A list of ContrastivePair objects.
40
+ """
41
+ log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
42
+
43
+ max_items = self._normalize_limit(limit)
44
+ docs = self.load_docs(lm_eval_task_data, max_items)
45
+
46
+ pairs: list[ContrastivePair] = []
47
+
48
+ log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
49
+
50
+ for doc in docs:
51
+ pair = self._extract_pair_from_doc(doc)
52
+ if pair is not None:
53
+ pairs.append(pair)
54
+ if max_items is not None and len(pairs) >= max_items:
55
+ break
56
+
57
+ if not pairs:
58
+ task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
59
+ log.warning("No valid QNLI pairs extracted", extra={"task": task_name})
60
+
61
+ return pairs
62
+
63
+ def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
64
+ """
65
+ Convert a single QNLI doc into a ContrastivePair, if possible.
66
+ Returns None when required fields are missing or malformed.
67
+ """
68
+ log = bind(_LOG, doc_id=doc.get("id", "unknown"))
69
+
70
+ try:
71
+ sentence = str(doc.get("sentence", "")).strip()
72
+ question = str(doc.get("question", "")).strip()
73
+ label = doc.get("label")
74
+
75
+ if not sentence or not question or label not in {0, 1}:
76
+ log.debug(
77
+ "Skipping doc due to missing/invalid fields",
78
+ extra={"doc": doc},
79
+ )
80
+ return None
81
+
82
+ formatted_question = f"{question}\n{sentence}\nQuestion: Does this response answer the question?\nAnswer:\nA. Yes\nB. No"
83
+
84
+ correct = "Yes" if label == 0 else "No"
85
+ incorrect = "No" if label == 0 else "Yes"
86
+
87
+ metadata = {
88
+ "label": "qnli",
89
+ }
90
+
91
+ return self._build_pair(
92
+ question=formatted_question,
93
+ correct=correct,
94
+ incorrect=incorrect,
95
+ metadata=metadata,
96
+ )
97
+
98
+ except Exception as exc:
99
+ log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
100
+ return None
101
+
102
+ @staticmethod
103
+ def _build_pair(
104
+ question: str,
105
+ correct: str,
106
+ incorrect: str,
107
+ metadata: dict[str, Any] | None = None,
108
+ ) -> ContrastivePair:
109
+ positive_response = PositiveResponse(model_response=correct)
110
+ negative_response = NegativeResponse(model_response=incorrect)
111
+ return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
@@ -0,0 +1,111 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, TYPE_CHECKING
4
+
5
+ from wisent.core.contrastive_pairs.core.pair import ContrastivePair
6
+ from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
7
+ from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
8
+ from wisent.core.cli_logger import setup_logger, bind
9
+
10
+ if TYPE_CHECKING:
11
+ from lm_eval.api.task import ConfigurableTask
12
+
13
+
14
+ __all__ = ["QQPxtractor"]
15
+ _LOG = setup_logger(__name__)
16
+
17
+
18
+ class QQPExtractor(LMEvalBenchmarkExtractor):
19
+ """Extractor for the QQP benchmark."""
20
+
21
+ def extract_contrastive_pairs(
22
+ self,
23
+ lm_eval_task_data: ConfigurableTask,
24
+ limit: int | None = None,
25
+ ) -> list[ContrastivePair]:
26
+ """
27
+ Build contrastive pairs from QQP docs.
28
+
29
+ QQP schema:
30
+ - question1: str
31
+ - question2: str
32
+ - label: 0 or 1
33
+
34
+ Args:
35
+ lm_eval_task_data: lm-eval task instance for QQP.
36
+ limit: Optional maximum number of pairs to produce.
37
+
38
+ Returns:
39
+ A list of ContrastivePair objects.
40
+ """
41
+ log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
42
+
43
+ max_items = self._normalize_limit(limit)
44
+ docs = self.load_docs(lm_eval_task_data, max_items)
45
+
46
+ pairs: list[ContrastivePair] = []
47
+
48
+ log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
49
+
50
+ for doc in docs:
51
+ pair = self._extract_pair_from_doc(doc)
52
+ if pair is not None:
53
+ pairs.append(pair)
54
+ if max_items is not None and len(pairs) >= max_items:
55
+ break
56
+
57
+ if not pairs:
58
+ task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
59
+ log.warning("No valid QNLI pairs extracted", extra={"task": task_name})
60
+
61
+ return pairs
62
+
63
+ def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
64
+ """
65
+ Convert a single QQP doc into a ContrastivePair, if possible.
66
+ Returns None when required fields are missing or malformed.
67
+ """
68
+ log = bind(_LOG, doc_id=doc.get("id", "unknown"))
69
+
70
+ try:
71
+ question1 = str(doc.get("question1", "")).strip()
72
+ question2 = str(doc.get("question2", "")).strip()
73
+ label = doc.get("label")
74
+
75
+ if not question1 or not question2 or label not in {0, 1}:
76
+ log.debug(
77
+ "Skipping doc due to missing/invalid fields",
78
+ extra={"doc": doc},
79
+ )
80
+ return None
81
+
82
+ formatted_question = f"{question1}\n{question2}\nQuestion: Do both qiestions ask the same thing?\nAnswer:\nA. Yes\nB. No"
83
+
84
+ correct = "Yes" if label == 1 else "No"
85
+ incorrect = "No" if label == 1 else "Yes"
86
+
87
+ metadata = {
88
+ "label": "qqp",
89
+ }
90
+
91
+ return self._build_pair(
92
+ question=formatted_question,
93
+ correct=correct,
94
+ incorrect=incorrect,
95
+ metadata=metadata,
96
+ )
97
+
98
+ except Exception as exc:
99
+ log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
100
+ return None
101
+
102
+ @staticmethod
103
+ def _build_pair(
104
+ question: str,
105
+ correct: str,
106
+ incorrect: str,
107
+ metadata: dict[str, Any] | None = None,
108
+ ) -> ContrastivePair:
109
+ positive_response = PositiveResponse(model_response=correct)
110
+ negative_response = NegativeResponse(model_response=incorrect)
111
+ return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
@@ -0,0 +1,121 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ import ast
5
+ from typing import Any, TYPE_CHECKING
6
+
7
+ from wisent.core.contrastive_pairs.core.pair import ContrastivePair
8
+ from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
9
+ from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
10
+ from wisent.core.cli_logger import setup_logger, bind
11
+
12
+ if TYPE_CHECKING:
13
+ from lm_eval.api.task import ConfigurableTask
14
+
15
+
16
+ __all__ = ["RaceExtractor"]
17
+ _LOG = setup_logger(__name__)
18
+
19
+
20
+ class RaceExtractor(LMEvalBenchmarkExtractor):
21
+ """Extractor for the Race benchmark."""
22
+
23
+ def extract_contrastive_pairs(
24
+ self,
25
+ lm_eval_task_data: ConfigurableTask,
26
+ limit: int | None = None,
27
+ ) -> list[ContrastivePair]:
28
+ """
29
+ Build contrastive pairs from Race docs.
30
+
31
+ Race schema:
32
+ - article: str
33
+ - problems: str
34
+
35
+ Args:
36
+ lm_eval_task_data: lm-eval task instance for Race.
37
+ limit: Optional maximum number of pairs to produce.
38
+
39
+ Returns:
40
+ A list of ContrastivePair objects.
41
+ """
42
+ log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
43
+
44
+ max_items = self._normalize_limit(limit)
45
+ docs = self.load_docs(lm_eval_task_data, max_items)
46
+
47
+ pairs: list[ContrastivePair] = []
48
+
49
+ log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
50
+
51
+ for doc in docs:
52
+ pair = self._extract_pair_from_doc(doc)
53
+ if pair is not None:
54
+ pairs.append(pair)
55
+ if max_items is not None and len(pairs) >= max_items:
56
+ break
57
+
58
+ if not pairs:
59
+ task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
60
+ log.warning("No valid Race pairs extracted", extra={"task": task_name})
61
+
62
+ return pairs
63
+
64
+ def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
65
+ """
66
+ Convert a single Race doc into a ContrastivePair, if possible.
67
+ Returns None when required fields are missing or malformed.
68
+ """
69
+ log = bind(_LOG, doc_id=doc.get("id", "unknown"))
70
+
71
+ try:
72
+ article = str(doc.get("article", "")).strip()
73
+ problems = doc.get("problems", "")
74
+ try:
75
+ problems = ast.literal_eval(problems)
76
+ except Exception as e:
77
+ print(f"Failed to parse problems: {e}")
78
+
79
+ if not article or not problems:
80
+ log.debug(
81
+ "Skipping doc due to missing/invalid fields",
82
+ extra={"doc": doc},
83
+ )
84
+ return None
85
+
86
+ problem = problems[0]
87
+ question = problem["question"]
88
+ options = problem["options"]
89
+ answer = problem["answer"]
90
+ answer_idx = int(ord(answer) - ord("A"))
91
+
92
+ correct = options[answer_idx]
93
+ incorrect = options[(answer_idx+1)%len(options)]
94
+
95
+ formatted_question = f"{article}\nQuestion: {question}?\nAnswer:\nA. {correct}\nB. {incorrect}"
96
+
97
+ metadata = {
98
+ "label": "race",
99
+ }
100
+
101
+ return self._build_pair(
102
+ question=formatted_question,
103
+ correct=correct,
104
+ incorrect=incorrect,
105
+ metadata=metadata,
106
+ )
107
+
108
+ except Exception as exc:
109
+ log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
110
+ return None
111
+
112
+ @staticmethod
113
+ def _build_pair(
114
+ question: str,
115
+ correct: str,
116
+ incorrect: str,
117
+ metadata: dict[str, Any] | None = None,
118
+ ) -> ContrastivePair:
119
+ positive_response = PositiveResponse(model_response=correct)
120
+ negative_response = NegativeResponse(model_response=incorrect)
121
+ return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
@@ -0,0 +1,121 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ from typing import Any, TYPE_CHECKING
5
+
6
+ from wisent.core.contrastive_pairs.core.pair import ContrastivePair
7
+ from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
8
+ from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
9
+ from wisent.core.cli_logger import setup_logger, bind
10
+
11
+ if TYPE_CHECKING:
12
+ from lm_eval.api.task import ConfigurableTask
13
+
14
+
15
+ __all__ = ["ReCoRDExtractor"]
16
+ _LOG = setup_logger(__name__)
17
+
18
+
19
+ class ReCoRDExtractor(LMEvalBenchmarkExtractor):
20
+ """Extractor for the ReCoRD benchmark."""
21
+
22
+ def extract_contrastive_pairs(
23
+ self,
24
+ lm_eval_task_data: ConfigurableTask,
25
+ limit: int | None = None,
26
+ ) -> list[ContrastivePair]:
27
+ """
28
+ Build contrastive pairs from ReCoRD docs.
29
+
30
+ ReCoRD schema:
31
+ - paassage: str
32
+ - query: str
33
+ - entities: list
34
+ - answers: list
35
+
36
+ Args:
37
+ lm_eval_task_data: lm-eval task instance for ReCoRD.
38
+ limit: Optional maximum number of pairs to produce.
39
+
40
+ Returns:
41
+ A list of ContrastivePair objects.
42
+ """
43
+ log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
44
+
45
+ max_items = self._normalize_limit(limit)
46
+ docs = self.load_docs(lm_eval_task_data, max_items)
47
+
48
+ pairs: list[ContrastivePair] = []
49
+
50
+ log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
51
+
52
+ for doc in docs:
53
+ pair = self._extract_pair_from_doc(doc)
54
+ if pair is not None:
55
+ pairs.append(pair)
56
+ if max_items is not None and len(pairs) >= max_items:
57
+ break
58
+
59
+ if not pairs:
60
+ task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
61
+ log.warning("No valid ReCoRD pairs extracted", extra={"task": task_name})
62
+
63
+ return pairs
64
+
65
+ def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
66
+ """
67
+ Convert a single ReCoRD doc into a ContrastivePair, if possible.
68
+ Returns None when required fields are missing or malformed.
69
+ """
70
+ log = bind(_LOG, doc_id=doc.get("id", "unknown"))
71
+
72
+ try:
73
+ passage = str(doc.get("passage", "")).strip()
74
+ query = str(doc.get("query", "")).strip()
75
+ entities = doc.get("entities", [])
76
+ answers = doc.get("answers", [])
77
+
78
+ if not passage or not query or not entities or not answers:
79
+ log.debug(
80
+ "Skipping doc due to missing/invalid fields",
81
+ extra={"doc": doc},
82
+ )
83
+ return None
84
+
85
+ correct = answers[0]
86
+ incorrect = None
87
+ for entity in entities:
88
+ if entity not in answers:
89
+ incorrect = entity
90
+ break
91
+
92
+ # Remove @highlight prefix
93
+ passage = passage.replace('@highlight', '')
94
+
95
+ formatted_question = f"Passage: {passage}\n\nQuery: {query}\nWhich option correctly completes the sentence at @placeholder?\nA. {incorrect}\nB. {correct}"
96
+
97
+ metadata = {
98
+ "label": "record",
99
+ }
100
+
101
+ return self._build_pair(
102
+ question=formatted_question,
103
+ correct=correct,
104
+ incorrect=incorrect,
105
+ metadata=metadata,
106
+ )
107
+
108
+ except Exception as exc:
109
+ log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
110
+ return None
111
+
112
+ @staticmethod
113
+ def _build_pair(
114
+ question: str,
115
+ correct: str,
116
+ incorrect: str,
117
+ metadata: dict[str, Any] | None = None,
118
+ ) -> ContrastivePair:
119
+ positive_response = PositiveResponse(model_response=correct)
120
+ negative_response = NegativeResponse(model_response=incorrect)
121
+ return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))