wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/activation_cache.py +393 -0
  3. wisent/core/activations/activations.py +3 -3
  4. wisent/core/activations/activations_collector.py +9 -5
  5. wisent/core/activations/classifier_inference_strategy.py +12 -11
  6. wisent/core/activations/extraction_strategy.py +256 -84
  7. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  8. wisent/core/cli/__init__.py +2 -1
  9. wisent/core/cli/agent/apply_steering.py +5 -7
  10. wisent/core/cli/agent/train_classifier.py +19 -7
  11. wisent/core/cli/check_linearity.py +35 -3
  12. wisent/core/cli/cluster_benchmarks.py +4 -6
  13. wisent/core/cli/create_steering_vector.py +6 -4
  14. wisent/core/cli/diagnose_vectors.py +7 -4
  15. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  16. wisent/core/cli/generate_pairs_from_task.py +9 -56
  17. wisent/core/cli/geometry_search.py +137 -0
  18. wisent/core/cli/get_activations.py +1 -1
  19. wisent/core/cli/method_optimizer.py +4 -3
  20. wisent/core/cli/modify_weights.py +3 -2
  21. wisent/core/cli/optimize_sample_size.py +1 -1
  22. wisent/core/cli/optimize_steering.py +14 -16
  23. wisent/core/cli/optimize_weights.py +2 -1
  24. wisent/core/cli/preview_pairs.py +203 -0
  25. wisent/core/cli/steering_method_trainer.py +3 -3
  26. wisent/core/cli/tasks.py +19 -76
  27. wisent/core/cli/train_unified_goodness.py +3 -3
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  30. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  36. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  37. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  38. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  273. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  274. wisent/core/geometry_runner.py +995 -0
  275. wisent/core/geometry_search_space.py +237 -0
  276. wisent/core/hyperparameter_optimizer.py +1 -1
  277. wisent/core/main.py +3 -0
  278. wisent/core/models/core/atoms.py +5 -3
  279. wisent/core/models/wisent_model.py +1 -1
  280. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  281. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  282. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  283. wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
  284. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  285. wisent/core/parser_arguments/main_parser.py +8 -0
  286. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  287. wisent/core/steering.py +5 -3
  288. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  289. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  290. wisent/core/trainers/steering_trainer.py +2 -2
  291. wisent/core/utils/device.py +27 -27
  292. wisent/core/utils/layer_combinations.py +70 -0
  293. wisent/examples/__init__.py +1 -0
  294. wisent/examples/scripts/__init__.py +1 -0
  295. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  296. wisent/examples/scripts/discover_directions.py +469 -0
  297. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  298. wisent/examples/scripts/generate_paper_data.py +384 -0
  299. wisent/examples/scripts/intervention_validation.py +626 -0
  300. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  301. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  302. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  303. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  304. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  305. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  306. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  307. wisent/examples/scripts/search_all_short_names.py +31 -0
  308. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  309. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  310. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  311. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  312. wisent/examples/scripts/test_one_benchmark.py +324 -0
  313. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  314. wisent/examples/scripts/threshold_analysis.py +434 -0
  315. wisent/examples/scripts/visualization_gallery.py +582 -0
  316. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  317. wisent/parameters/lm_eval/category_directions.json +137 -0
  318. wisent/parameters/lm_eval/repair_plan.json +282 -0
  319. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  320. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  321. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  322. wisent/tests/test_detector_accuracy.py +1 -1
  323. wisent/tests/visualize_geometry.py +1 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  325. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
  326. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  327. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  328. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  329. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  330. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -123,14 +123,12 @@ class Llama3Extractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "llama3",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -123,14 +123,12 @@ class LmSynevalExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "lm_syneval",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -86,14 +86,14 @@ class LogiQAExtractor(LMEvalBenchmarkExtractor):
  incorrect = options[(label_idx+1)%len(options)]

  question = f"{question}"
- formatted_question = f"Passage: {context}\nQuestion: {question}\nA. {incorrect}\nB. {correct}"
+ prompt = f"Passage: {context}\nQuestion: {question}"

  metadata = {
  "label": "logiqa",
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -85,14 +85,14 @@ class LogiQA2Extractor(LMEvalBenchmarkExtractor):
  incorrect = options[(answer+1)%len(options)]

  question = f"{question}"
- formatted_question = f"Passage: {text}\nQuestion: {question}\nA. {incorrect}\nB. {correct}"
+ prompt = f"Passage: {text}\nQuestion: {question}"

  metadata = {
  "label": "logiqa2",
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -123,14 +123,12 @@ class LongbenchExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "longbench",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -123,14 +123,12 @@ class Longbenchv2Extractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "longbenchv2",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -115,7 +115,7 @@ class MastermindExtractor(LMEvalBenchmarkExtractor):
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -174,14 +174,12 @@ class MastermindExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "mastermind",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -83,7 +83,7 @@ class MCTACOExtractor(LMEvalBenchmarkExtractor):
  )
  return None

- formatted_question = f"{sentence}\nQuestion: {question}\nAnswer: {answer}\nPlausible:\nA. Yes\nB. No"
+ prompt = f"{sentence}\nQuestion: {question}\nAnswer: {answer}\nPlausible?"

  correct = "Yes" if label == 1 else "No"
  incorrect = "No" if label == 1 else "Yes"
@@ -93,7 +93,7 @@ class MCTACOExtractor(LMEvalBenchmarkExtractor):
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -131,7 +131,7 @@ class MedConceptsQaExtractor(LMEvalBenchmarkExtractor):

  # For this format, the response should be just the letter
  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=answer_key,
  incorrect=chr(ord('A') + incorrect_idx),
  metadata=metadata,
@@ -195,14 +195,12 @@ class MedConceptsQaExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "med_concepts_qa",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -151,14 +151,12 @@ class MeddialogExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "meddialog",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -80,12 +80,10 @@ class MedicalExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "medical"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -140,14 +140,12 @@ class MedmcqaExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "medmcqa",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -89,14 +89,14 @@ class MedQAExtractor(LMEvalBenchmarkExtractor):
  correct = endings[label]
  incorrect = endings[(label + 1) % 4]

- formatted_question = f"Question: {sent1}\nA. {incorrect}\nB. {correct}"
+ prompt = f"Question: {sent1}"

  metadata = {
  "label": "medqa",
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -67,11 +67,11 @@ class MelaExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = 1 - answer_idx
  incorrect = choices[incorrect_idx]

- formatted_question = f"Sentence: {sentence}\nDetermine whether this sentence is acceptable or unacceptable?\nA. {incorrect}\nB. {correct}"
+ prompt = f"Sentence: {sentence}\nDetermine whether this sentence is acceptable or unacceptable?"
  metadata = {"label": "mela"}

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -125,14 +125,12 @@ class MetabenchExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "metabench",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -143,14 +143,12 @@ class MinervaMathExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "minerva_math",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -115,14 +115,12 @@ class MMLUExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "mmlu",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -154,20 +154,19 @@ class MmlusrExtractor(LMEvalBenchmarkExtractor):
  )
  return None

- # Build prompt matching lm-eval format
+ # Build prompt - raw question without MC formatting
  correct = choices[answer_idx]
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- # Format exactly as lm-eval does it
- formatted_question = f"{question}\nA. {choices[0]}\nB. {choices[1]}\nC. {choices[2]}\nD. {choices[3]}\nAnswer:"
+ prompt = question

  metadata = {
  "label": "mmlusr",
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -79,7 +79,7 @@ class MRPCExtractor(LMEvalBenchmarkExtractor):
  )
  return None

- formatted_question = f"Sentence 1: {sentence1}\nSentence 2: {sentence2}. Do both sequences mean the same thing?\nAnswer:\nA. Yes\nB. No"
+ prompt = f"Sentence 1: {sentence1}\nSentence 2: {sentence2}. Do both sequences mean the same thing?"

  correct = "Yes" if label == 1 else "No"
  incorrect = "No" if label == 1 else "Yes"
@@ -89,7 +89,7 @@ class MRPCExtractor(LMEvalBenchmarkExtractor):
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -100,11 +100,8 @@ class MultiblimpExtractor(LMEvalBenchmarkExtractor):
  log.debug("Skipping doc with missing sen/wrong_sen", extra={"doc": doc})
  return None

- # Prompt: present both sentences as choices (matching lm-eval format)
- # Since doc_to_text is empty, we format as multiple choice
- prompt = "Which sentence is grammatically correct?\nA. {}\nB. {}".format(
- correct_sentence, incorrect_sentence
- )
+ # Raw prompt without A./B. formatting
+ prompt = "Which sentence is grammatically correct?"

  metadata = {"label": "multiblimp"}

@@ -82,7 +82,7 @@ class MultiRCExtractor(LMEvalBenchmarkExtractor):
  )
  return None

- formatted_question = f"{paragraph}\nQuestion: {question}\nAnswer: {answer}\nIs this answer correct?\nA. Yes\nB. No"
+ prompt = f"{paragraph}\nQuestion: {question}\nAnswer: {answer}\nIs this answer correct?"

  correct = "Yes" if label == 1 else "No"
  incorrect = "No" if label == 1 else "Yes"
@@ -92,7 +92,7 @@ class MultiRCExtractor(LMEvalBenchmarkExtractor):
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -84,14 +84,14 @@ class MutualExtractor(LMEvalBenchmarkExtractor):
  correct = options[answer_idx]
  incorrect = options[(answer_idx+1)%len(options)]

- formatted_question = (f"{article}\nA. {incorrect}\nB. {correct}")
+ prompt = article

  metadata = {
  "label": "mutual",
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -80,12 +80,10 @@ class NonExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "non"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -144,14 +144,12 @@ class NorevalExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "noreval",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -128,14 +128,12 @@ class NorevalExactMatchExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "noreval_exact",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -136,14 +136,12 @@ class NorevalGenerationExactMatchExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "noreval_gen_exact",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -107,12 +107,12 @@ class NorevalMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
  log.debug("Skipping doc due to empty correct/wrong fields", extra={"doc": doc})
  return None

- formatted_question = f"Which sentence is grammatically correct?\nA. {incorrect}\nB. {correct}"
+ prompt = f"Which sentence is grammatically correct?"

  metadata = {"label": "noreval_ncb"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -151,12 +151,10 @@ class NorevalMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[correct_idx]).strip()
  incorrect = str(choices[incorrect_idx]).strip()

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {"label": "noreval_truthfulqa"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -192,12 +190,10 @@ class NorevalMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choice_texts)
  incorrect = str(choice_texts[incorrect_idx]).strip()

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {"label": "noreval_nrk_quiz"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -102,12 +102,12 @@ class NorevalMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
  log.debug("Skipping doc due to empty correct/wrong fields", extra={"doc": doc})
  return None

- formatted_question = f"Which sentence is grammatically correct?\nA. {incorrect}\nB. {correct}"
+ prompt = f"Which sentence is grammatically correct?"

  metadata = {"label": "noreval_ncb"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -146,12 +146,10 @@ class NorevalMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[correct_idx]).strip()
  incorrect = str(choices[incorrect_idx]).strip()

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {"label": "noreval_truthfulqa"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -187,12 +185,10 @@ class NorevalMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choice_texts)
  incorrect = str(choice_texts[incorrect_idx]).strip()

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {"label": "noreval_nrk_quiz"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -106,14 +106,14 @@ class NQOpenExtractor(LMEvalBenchmarkExtractor):
  if incorrect == correct:
  incorrect += "k"

- formatted_question = f"Question: {question}\nAnswer:\nA. {incorrect}\nB. {correct}"
+ prompt = f"Question: {question}\nAnswer:"

  metadata = {
  "label": "nq_open",
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -138,14 +138,12 @@ class OkapiArcMultilingualExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "okapi/arc_multilingual",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -145,14 +145,12 @@ class OkapiHellaswagMultilingualExtractor(LMEvalBenchmarkExtractor):
  )
  return None

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "okapi/hellaswag_multilingual",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -133,14 +133,12 @@ class OkapiMmluMultilingualExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "okapi/mmlu_multilingual",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
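
The hunks above all make the same change: the A./B. choice text (and any "Answer:" scaffolding) is no longer folded into the prompt string, and _build_pair receives the raw question or passage prompt. Below is a minimal, self-contained sketch of that post-change extractor shape; the doc field names ("question", "choices", "answer_idx") and the local build_pair stand-in are assumptions for illustration only, not the package's actual API.

from typing import Optional


def build_pair(question: str, correct: str, incorrect: str, metadata: dict) -> dict:
    # Stand-in for the LMEvalBenchmarkExtractor._build_pair call seen in the hunks,
    # which is invoked with exactly these keyword arguments.
    return {"question": question, "correct": correct, "incorrect": incorrect, "metadata": metadata}


def extract_pair(doc: dict) -> Optional[dict]:
    # Hypothetical field names; the real extractors read benchmark-specific keys.
    question = str(doc.get("question", "")).strip()
    choices = doc.get("choices") or []
    answer_idx = doc.get("answer_idx")
    if not question or not choices or answer_idx is None:
        return None

    correct = str(choices[answer_idx]).strip()
    # Same "next choice" heuristic the extractors above use for the incorrect answer.
    incorrect = str(choices[(answer_idx + 1) % len(choices)]).strip()

    # 0.7.901 pattern: pass the raw question; no "A. {incorrect}\nB. {correct}" prefix.
    return build_pair(question=question, correct=correct, incorrect=incorrect, metadata={"label": "example"})


print(extract_pair({"question": "2 + 2?", "choices": ["4", "5"], "answer_idx": 0}))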