wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (391)
  1. wisent/__init__.py +1 -1
  2. wisent/comparison/__init__.py +1 -0
  3. wisent/comparison/detect_bos_features.py +275 -0
  4. wisent/comparison/fgaa.py +465 -0
  5. wisent/comparison/lora.py +669 -0
  6. wisent/comparison/lora_dpo.py +592 -0
  7. wisent/comparison/main.py +444 -0
  8. wisent/comparison/ours.py +76 -0
  9. wisent/comparison/sae.py +304 -0
  10. wisent/comparison/utils.py +381 -0
  11. wisent/core/activations/activation_cache.py +393 -0
  12. wisent/core/activations/activations.py +3 -3
  13. wisent/core/activations/activations_collector.py +12 -7
  14. wisent/core/activations/classifier_inference_strategy.py +12 -11
  15. wisent/core/activations/extraction_strategy.py +260 -84
  16. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  17. wisent/core/cli/__init__.py +2 -1
  18. wisent/core/cli/agent/train_classifier.py +16 -3
  19. wisent/core/cli/check_linearity.py +35 -3
  20. wisent/core/cli/cluster_benchmarks.py +4 -6
  21. wisent/core/cli/create_steering_vector.py +6 -4
  22. wisent/core/cli/diagnose_vectors.py +7 -4
  23. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  24. wisent/core/cli/generate_pairs_from_task.py +9 -56
  25. wisent/core/cli/generate_vector_from_task.py +11 -20
  26. wisent/core/cli/geometry_search.py +137 -0
  27. wisent/core/cli/get_activations.py +2 -2
  28. wisent/core/cli/method_optimizer.py +4 -3
  29. wisent/core/cli/modify_weights.py +3 -2
  30. wisent/core/cli/optimize_sample_size.py +1 -1
  31. wisent/core/cli/optimize_steering.py +14 -16
  32. wisent/core/cli/optimize_weights.py +2 -1
  33. wisent/core/cli/preview_pairs.py +203 -0
  34. wisent/core/cli/steering_method_trainer.py +3 -3
  35. wisent/core/cli/tasks.py +19 -76
  36. wisent/core/cli/train_unified_goodness.py +3 -3
  37. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  38. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  53. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  54. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  55. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  56. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  57. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  58. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  59. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  60. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  61. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +22 -5
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +10 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +9 -4
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  273. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  274. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  275. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  276. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  277. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  278. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  279. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  280. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  281. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  282. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  283. wisent/core/geometry_runner.py +995 -0
  284. wisent/core/geometry_search_space.py +237 -0
  285. wisent/core/hyperparameter_optimizer.py +1 -1
  286. wisent/core/main.py +3 -0
  287. wisent/core/models/core/atoms.py +5 -3
  288. wisent/core/models/wisent_model.py +1 -1
  289. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  290. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  291. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  292. wisent/core/parser_arguments/generate_vector_from_task_parser.py +6 -13
  293. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  294. wisent/core/parser_arguments/get_activations_parser.py +5 -14
  295. wisent/core/parser_arguments/main_parser.py +8 -0
  296. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  297. wisent/core/steering.py +5 -3
  298. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  299. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  300. wisent/core/trainers/steering_trainer.py +2 -2
  301. wisent/core/utils/device.py +27 -27
  302. wisent/core/utils/layer_combinations.py +70 -0
  303. wisent/examples/__init__.py +1 -0
  304. wisent/examples/scripts/__init__.py +1 -0
  305. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  306. wisent/examples/scripts/discover_directions.py +469 -0
  307. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  308. wisent/examples/scripts/search_all_short_names.py +31 -0
  309. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  310. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  311. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  312. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  313. wisent/examples/scripts/test_one_benchmark.py +324 -0
  314. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  315. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  316. wisent/parameters/lm_eval/category_directions.json +137 -0
  317. wisent/parameters/lm_eval/repair_plan.json +282 -0
  318. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  319. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  320. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  321. wisent/tests/test_detector_accuracy.py +1 -1
  322. wisent/tests/visualize_geometry.py +1 -1
  323. {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/METADATA +5 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/RECORD +328 -358
  325. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  326. wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +0 -2112
  327. wisent/examples/scripts/1/test_basqueglue_evaluation.json +0 -51
  328. wisent/examples/scripts/1/test_basqueglue_pairs.json +0 -14
  329. wisent/examples/scripts/1/test_bec2016eu_evaluation.json +0 -51
  330. wisent/examples/scripts/1/test_bec2016eu_pairs.json +0 -14
  331. wisent/examples/scripts/1/test_belebele_evaluation.json +0 -51
  332. wisent/examples/scripts/1/test_belebele_pairs.json +0 -14
  333. wisent/examples/scripts/1/test_benchmarks_evaluation.json +0 -51
  334. wisent/examples/scripts/1/test_benchmarks_pairs.json +0 -14
  335. wisent/examples/scripts/1/test_bertaqa_evaluation.json +0 -51
  336. wisent/examples/scripts/1/test_bertaqa_pairs.json +0 -14
  337. wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +0 -30
  338. wisent/examples/scripts/1/test_bhtc_v2_pairs.json +0 -8
  339. wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +0 -30
  340. wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +0 -8
  341. wisent/examples/scripts/1/test_cabreu_evaluation.json +0 -30
  342. wisent/examples/scripts/1/test_cabreu_pairs.json +0 -8
  343. wisent/examples/scripts/1/test_careqa_en_evaluation.json +0 -30
  344. wisent/examples/scripts/1/test_careqa_en_pairs.json +0 -8
  345. wisent/examples/scripts/1/test_careqa_evaluation.json +0 -30
  346. wisent/examples/scripts/1/test_careqa_pairs.json +0 -8
  347. wisent/examples/scripts/1/test_catalanqa_evaluation.json +0 -30
  348. wisent/examples/scripts/1/test_catalanqa_pairs.json +0 -8
  349. wisent/examples/scripts/1/test_catcola_evaluation.json +0 -30
  350. wisent/examples/scripts/1/test_catcola_pairs.json +0 -8
  351. wisent/examples/scripts/1/test_chartqa_evaluation.json +0 -30
  352. wisent/examples/scripts/1/test_chartqa_pairs.json +0 -8
  353. wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +0 -30
  354. wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +0 -8
  355. wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +0 -30
  356. wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +0 -8
  357. wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +0 -30
  358. wisent/examples/scripts/1/test_cocoteros_es_pairs.json +0 -8
  359. wisent/examples/scripts/1/test_coedit_gec_evaluation.json +0 -30
  360. wisent/examples/scripts/1/test_coedit_gec_pairs.json +0 -8
  361. wisent/examples/scripts/1/test_cola_evaluation.json +0 -30
  362. wisent/examples/scripts/1/test_cola_pairs.json +0 -8
  363. wisent/examples/scripts/1/test_coqcat_evaluation.json +0 -30
  364. wisent/examples/scripts/1/test_coqcat_pairs.json +0 -8
  365. wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +0 -30
  366. wisent/examples/scripts/1/test_dbpedia_14_pairs.json +0 -8
  367. wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +0 -30
  368. wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +0 -8
  369. wisent/examples/scripts/1/test_ethos_binary_evaluation.json +0 -30
  370. wisent/examples/scripts/1/test_ethos_binary_pairs.json +0 -8
  371. wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +0 -30
  372. wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +0 -8
  373. wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +0 -30
  374. wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +0 -8
  375. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  376. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +0 -8
  377. wisent/examples/scripts/2/test_arc_ar_evaluation.json +0 -30
  378. wisent/examples/scripts/2/test_arc_ar_pairs.json +0 -8
  379. wisent/examples/scripts/2/test_atis_evaluation.json +0 -30
  380. wisent/examples/scripts/2/test_atis_pairs.json +0 -8
  381. wisent/examples/scripts/2/test_babi_evaluation.json +0 -30
  382. wisent/examples/scripts/2/test_babi_pairs.json +0 -8
  383. wisent/examples/scripts/2/test_babilong_evaluation.json +0 -30
  384. wisent/examples/scripts/2/test_babilong_pairs.json +0 -8
  385. wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +0 -30
  386. wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +0 -8
  387. wisent/examples/scripts/2/test_basque-glue_pairs.json +0 -14
  388. {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/WHEEL +0 -0
  389. {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/entry_points.txt +0 -0
  390. {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/licenses/LICENSE +0 -0
  391. {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/top_level.txt +0 -0
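Most of the extractor hunks that follow share a single pattern: the inline prompt formatting that baked both options into the question (Question: ...\nA. {incorrect}\nB. {correct}) is deleted, and the raw question is passed to _build_pair instead, so the contrast is carried entirely by the correct/incorrect responses. A minimal sketch of the before/after behavior, using a hypothetical stand-in for _build_pair (the real method is defined on LMEvalBenchmarkExtractor and is not part of this diff):

# Hypothetical stand-in for LMEvalBenchmarkExtractor._build_pair, used only
# to illustrate the recurring change in the hunks below.
def build_pair(question: str, correct: str, incorrect: str) -> dict:
    return {"prompt": question, "positive": correct, "negative": incorrect}

question, correct, incorrect = "What color is the sky?", "blue", "green"

# Old behavior: the options were baked into the prompt text.
old = build_pair(f"Question: {question}\nA. {incorrect}\nB. {correct}", correct, incorrect)

# New behavior: the raw question is the prompt; the responses carry the contrast.
new = build_pair(question, correct, incorrect)

assert "A. green" in old["prompt"]
assert new["prompt"] == "What color is the sky?"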
@@ -178,14 +178,12 @@ class AcpBenchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "acp_bench",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -156,14 +156,12 @@ class AcpBenchHardExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "acp_bench_hard",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -66,7 +66,7 @@ class AdvancedExtractor(LMEvalBenchmarkExtractor):
         metadata = {"label": "advanced_ai_risk"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -103,12 +103,10 @@ class AdvancedExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "advanced"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -155,14 +155,12 @@ class AexamsExtractor(LMEvalBenchmarkExtractor):
             )
             return None
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "aexams",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -86,12 +86,10 @@ class AfrimmluExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "afrimmlu"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -101,8 +101,8 @@ class AfrixnliExtractor(LMEvalBenchmarkExtractor):
             return None
         incorrect = label_map[incorrect_labels[0]]
 
-        # Format the NLI prompt
-        prompt = f"Premise: {premise}\nHypothesis: {hypothesis}.\nA. {incorrect}\nB. {correct}"
+        # Raw prompt without A./B. formatting
+        prompt = f"Premise: {premise}\nHypothesis: {hypothesis}"
 
         metadata = {"label": "afrixnli"}
 
@@ -151,14 +151,12 @@ class ArabcultureExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "arabculture",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -71,12 +71,10 @@ class ArabicExtractor(LMEvalBenchmarkExtractor):
         correct = choices[answer_idx]
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "arabic"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -77,12 +77,10 @@ class ArabicExamsExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "arabic_exams"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -139,14 +139,12 @@ class ArabicLeaderboardCompleteExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "arabic_leaderboard_complete",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -139,14 +139,12 @@ class ArabicLeaderboardLightExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "arabic_leaderboard_light",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -138,14 +138,12 @@ class ArabicmmluExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "arabicmmlu",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -241,12 +241,10 @@ class AradiceExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "aradice"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -106,12 +106,10 @@ class ArcExtractor(LMEvalBenchmarkExtractor):
                 extra={"doc": doc},
             )
             return None
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "arc"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -89,14 +89,13 @@ class ArcChallengeExtractor(LMEvalBenchmarkExtractor):
         incorrect = choices[(answer_idx+1)%len(choices)]
 
         question = f"{question}"
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
 
         metadata = {
             "label": "arc_easy",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -89,14 +89,13 @@ class ArcEasyExtractor(LMEvalBenchmarkExtractor):
         incorrect = choices[(answer_idx+1)%len(choices)]
 
         question = f"{question}"
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
 
         metadata = {
             "label": "arc_easy",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -85,14 +85,14 @@ class ArithmeticExtractor(LMEvalBenchmarkExtractor):
         incorrect_val = float(completion) + 1
         incorrect = str(int(incorrect_val)) if incorrect_val == int(incorrect_val) else str(incorrect_val)
 
-        formatted_question = f"{context}\nA. {incorrect}\nB. {correct}"
+        prompt = f"{context}"
 
         metadata = {
             "label": "arithmetic",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -93,14 +93,14 @@ class ASDivExtractor(LMEvalBenchmarkExtractor):
         incorrect_val = float(numerical_answer) + 1
         incorrect = str(int(incorrect_val)) if incorrect_val == int(incorrect_val) else str(incorrect_val)
 
-        formatted_question = f"{body}\nQuestion:{question}\nA. {incorrect}\nB. {correct}"
+        prompt = f"{body}\nQuestion:{question}"
 
         metadata = {
             "label": "asdiv",
         }
 
        return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
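Both math extractors above (arithmetic and asdiv) build the distractor as the correct value plus one, then drop a trailing ".0" when the result is whole. A quick illustration of that formatting branch:

# Illustration of the int-vs-float distractor formatting shared by the
# arithmetic and asdiv hunks above.
for completion in ("5", "2.5"):
    incorrect_val = float(completion) + 1
    incorrect = str(int(incorrect_val)) if incorrect_val == int(incorrect_val) else str(incorrect_val)
    print(completion, "->", incorrect)  # prints: 5 -> 6, then 2.5 -> 3.5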
@@ -82,8 +82,42 @@ class BabiExtractor(LMEvalBenchmarkExtractor):
             log.debug("Skipping doc due to missing/invalid fields", extra={"doc": doc})
             return None
 
-        # Create an incorrect answer by appending "incorrect" or using a generic wrong answer
-        incorrect = f"not {correct}"
+        # Create an incorrect answer using plausible alternatives from babi vocabulary
+        import random
+        random.seed(hash(correct + passage) % (2**32))
+
+        # Common babi answer categories
+        locations = ['bathroom', 'bedroom', 'kitchen', 'garden', 'hallway', 'office', 'park']
+        people = ['Mary', 'John', 'Sandra', 'Daniel', 'Bill', 'Fred', 'Julie', 'Emily']
+        objects = ['football', 'apple', 'milk', 'keys', 'box', 'ball']
+        directions = ['north', 'south', 'east', 'west']
+        animals = ['cat', 'dog', 'mouse', 'wolf', 'sheep', 'lion']
+        yes_no = ['yes', 'no']
+
+        # Determine answer type and pick a wrong alternative
+        correct_lower = correct.lower()
+        if correct_lower in [l.lower() for l in locations]:
+            incorrect = random.choice([l for l in locations if l.lower() != correct_lower])
+        elif correct_lower in [p.lower() for p in people]:
+            incorrect = random.choice([p for p in people if p.lower() != correct_lower])
+        elif correct_lower in [o.lower() for o in objects]:
+            incorrect = random.choice([o for o in objects if o.lower() != correct_lower])
+        elif correct_lower in [d.lower() for d in directions]:
+            incorrect = random.choice([d for d in directions if d.lower() != correct_lower])
+        elif correct_lower in [a.lower() for a in animals]:
+            incorrect = random.choice([a for a in animals if a.lower() != correct_lower])
+        elif correct_lower in yes_no:
+            incorrect = 'no' if correct_lower == 'yes' else 'yes'
+        elif correct.isdigit():
+            num = int(correct)
+            incorrect = str(random.choice([n for n in [num-1, num+1, num*2] if n != num and n >= 0]))
+        else:
+            # Fallback: use a generic wrong answer from the passage words
+            passage_words = [w for w in passage.split() if len(w) > 3 and w.isalpha() and w.lower() != correct_lower]
+            if passage_words:
+                incorrect = random.choice(passage_words)
+            else:
+                incorrect = "unknown"
 
         # Format the prompt with passage and question
         prompt = f"Passage: {passage}\n\nQuestion: {question}"
@@ -126,14 +126,12 @@ class BasqueBenchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "basque_bench",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -140,14 +140,12 @@ class BbqExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "bbq",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -152,14 +152,12 @@ class BelebeleExtractor(LMEvalBenchmarkExtractor):
             )
             return None
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "belebele",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -126,14 +126,12 @@ class BenchmarksExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "benchmarks",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -136,14 +136,12 @@ class BertaqaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "bertaqa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -126,14 +126,12 @@ class BhsExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "bhs",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -72,11 +72,11 @@ class BhtcExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
 
-        formatted_question = f"Text: {question}\nQuestion: What is the topic of the above text?\nA. {incorrect}\nB. {correct}"
+        prompt = f"Text: {question}\nQuestion: What is the topic of the above text?"
         metadata = {"label": "bhtc"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -116,12 +116,10 @@ class BhtcExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "bhtc"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -142,14 +142,12 @@ class BlimpExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "blimp",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class BlimpNlExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "blimp_nl",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -85,17 +85,17 @@ class BoolQExtractor(LMEvalBenchmarkExtractor):
             )
             return None
 
-        correct = "Yes" if label == 1 else "No"
-        incorrect = "No" if label == 1 else "Yes"
+        correct = "yes" if label == 1 else "no"
+        incorrect = "no" if label == 1 else "yes"
 
-        formatted_question = f"{passage}\nQuestion: {question}?\nAnswer:\nA. {incorrect}\nB. {correct}"
+        prompt = f"{passage}\nQuestion: {question}?\nAnswer:"
 
         metadata = {
             "label": "boolq",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -114,4 +114,21 @@ class BoolQExtractor(LMEvalBenchmarkExtractor):
     ) -> ContrastivePair:
         positive_response = PositiveResponse(model_response=correct)
         negative_response = NegativeResponse(model_response=incorrect)
-        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
+
+    @staticmethod
+    def extract_choices_and_answer(task, doc: dict[str, Any]) -> tuple[list[str], str]:
+        """
+        Extract choices and expected answer from a BoolQ document.
+
+        Args:
+            task: lm-eval task instance (has doc_to_choice, doc_to_target methods)
+            doc: BoolQ document
+
+        Returns:
+            Tuple of (choices, expected_answer)
+        """
+        choices = task.doc_to_choice(doc)
+        target_idx = task.doc_to_target(doc)
+        expected = choices[target_idx]
+        return choices, expected
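The new extract_choices_and_answer helper delegates to the lm-eval task's doc_to_choice/doc_to_target and assumes doc_to_target returns an integer index into the choice list. A usage sketch with a stub task object (the stub is illustrative, not the lm-eval API):

# Usage sketch for the new static helper; StubTask is a hypothetical stand-in
# for an lm-eval task whose doc_to_target returns an integer index.
class StubTask:
    def doc_to_choice(self, doc):
        return ["no", "yes"]

    def doc_to_target(self, doc):
        return int(doc["label"])

doc = {"question": "is the sky blue", "label": 1}
choices, expected = BoolQExtractor.extract_choices_and_answer(StubTask(), doc)
# choices == ["no", "yes"], expected == "yes"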
@@ -121,12 +121,10 @@ class C4Extractor(LMEvalBenchmarkExtractor):
         correct = choices[answer_idx]
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "c4"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class CabbqExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "cabbq",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -140,14 +140,12 @@ class CareqaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "careqa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -126,14 +126,12 @@ class CatalanBenchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "catalan_bench",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -144,12 +144,10 @@ class CatalanqaExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "catalanqa"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,