wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (391)
  1. wisent/__init__.py +1 -1
  2. wisent/comparison/__init__.py +1 -0
  3. wisent/comparison/detect_bos_features.py +275 -0
  4. wisent/comparison/fgaa.py +465 -0
  5. wisent/comparison/lora.py +669 -0
  6. wisent/comparison/lora_dpo.py +592 -0
  7. wisent/comparison/main.py +444 -0
  8. wisent/comparison/ours.py +76 -0
  9. wisent/comparison/sae.py +304 -0
  10. wisent/comparison/utils.py +381 -0
  11. wisent/core/activations/activation_cache.py +393 -0
  12. wisent/core/activations/activations.py +3 -3
  13. wisent/core/activations/activations_collector.py +12 -7
  14. wisent/core/activations/classifier_inference_strategy.py +12 -11
  15. wisent/core/activations/extraction_strategy.py +260 -84
  16. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  17. wisent/core/cli/__init__.py +2 -1
  18. wisent/core/cli/agent/train_classifier.py +16 -3
  19. wisent/core/cli/check_linearity.py +35 -3
  20. wisent/core/cli/cluster_benchmarks.py +4 -6
  21. wisent/core/cli/create_steering_vector.py +6 -4
  22. wisent/core/cli/diagnose_vectors.py +7 -4
  23. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  24. wisent/core/cli/generate_pairs_from_task.py +9 -56
  25. wisent/core/cli/generate_vector_from_task.py +11 -20
  26. wisent/core/cli/geometry_search.py +137 -0
  27. wisent/core/cli/get_activations.py +2 -2
  28. wisent/core/cli/method_optimizer.py +4 -3
  29. wisent/core/cli/modify_weights.py +3 -2
  30. wisent/core/cli/optimize_sample_size.py +1 -1
  31. wisent/core/cli/optimize_steering.py +14 -16
  32. wisent/core/cli/optimize_weights.py +2 -1
  33. wisent/core/cli/preview_pairs.py +203 -0
  34. wisent/core/cli/steering_method_trainer.py +3 -3
  35. wisent/core/cli/tasks.py +19 -76
  36. wisent/core/cli/train_unified_goodness.py +3 -3
  37. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  38. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  53. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  54. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  55. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  56. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  57. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  58. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  59. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  60. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  61. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +22 -5
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +10 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +9 -4
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  273. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  274. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  275. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  276. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  277. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  278. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  279. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  280. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  281. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  282. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  283. wisent/core/geometry_runner.py +995 -0
  284. wisent/core/geometry_search_space.py +237 -0
  285. wisent/core/hyperparameter_optimizer.py +1 -1
  286. wisent/core/main.py +3 -0
  287. wisent/core/models/core/atoms.py +5 -3
  288. wisent/core/models/wisent_model.py +1 -1
  289. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  290. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  291. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  292. wisent/core/parser_arguments/generate_vector_from_task_parser.py +6 -13
  293. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  294. wisent/core/parser_arguments/get_activations_parser.py +5 -14
  295. wisent/core/parser_arguments/main_parser.py +8 -0
  296. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  297. wisent/core/steering.py +5 -3
  298. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  299. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  300. wisent/core/trainers/steering_trainer.py +2 -2
  301. wisent/core/utils/device.py +27 -27
  302. wisent/core/utils/layer_combinations.py +70 -0
  303. wisent/examples/__init__.py +1 -0
  304. wisent/examples/scripts/__init__.py +1 -0
  305. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  306. wisent/examples/scripts/discover_directions.py +469 -0
  307. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  308. wisent/examples/scripts/search_all_short_names.py +31 -0
  309. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  310. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  311. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  312. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  313. wisent/examples/scripts/test_one_benchmark.py +324 -0
  314. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  315. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  316. wisent/parameters/lm_eval/category_directions.json +137 -0
  317. wisent/parameters/lm_eval/repair_plan.json +282 -0
  318. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  319. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  320. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  321. wisent/tests/test_detector_accuracy.py +1 -1
  322. wisent/tests/visualize_geometry.py +1 -1
  323. {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/METADATA +5 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/RECORD +328 -358
  325. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  326. wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +0 -2112
  327. wisent/examples/scripts/1/test_basqueglue_evaluation.json +0 -51
  328. wisent/examples/scripts/1/test_basqueglue_pairs.json +0 -14
  329. wisent/examples/scripts/1/test_bec2016eu_evaluation.json +0 -51
  330. wisent/examples/scripts/1/test_bec2016eu_pairs.json +0 -14
  331. wisent/examples/scripts/1/test_belebele_evaluation.json +0 -51
  332. wisent/examples/scripts/1/test_belebele_pairs.json +0 -14
  333. wisent/examples/scripts/1/test_benchmarks_evaluation.json +0 -51
  334. wisent/examples/scripts/1/test_benchmarks_pairs.json +0 -14
  335. wisent/examples/scripts/1/test_bertaqa_evaluation.json +0 -51
  336. wisent/examples/scripts/1/test_bertaqa_pairs.json +0 -14
  337. wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +0 -30
  338. wisent/examples/scripts/1/test_bhtc_v2_pairs.json +0 -8
  339. wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +0 -30
  340. wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +0 -8
  341. wisent/examples/scripts/1/test_cabreu_evaluation.json +0 -30
  342. wisent/examples/scripts/1/test_cabreu_pairs.json +0 -8
  343. wisent/examples/scripts/1/test_careqa_en_evaluation.json +0 -30
  344. wisent/examples/scripts/1/test_careqa_en_pairs.json +0 -8
  345. wisent/examples/scripts/1/test_careqa_evaluation.json +0 -30
  346. wisent/examples/scripts/1/test_careqa_pairs.json +0 -8
  347. wisent/examples/scripts/1/test_catalanqa_evaluation.json +0 -30
  348. wisent/examples/scripts/1/test_catalanqa_pairs.json +0 -8
  349. wisent/examples/scripts/1/test_catcola_evaluation.json +0 -30
  350. wisent/examples/scripts/1/test_catcola_pairs.json +0 -8
  351. wisent/examples/scripts/1/test_chartqa_evaluation.json +0 -30
  352. wisent/examples/scripts/1/test_chartqa_pairs.json +0 -8
  353. wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +0 -30
  354. wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +0 -8
  355. wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +0 -30
  356. wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +0 -8
  357. wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +0 -30
  358. wisent/examples/scripts/1/test_cocoteros_es_pairs.json +0 -8
  359. wisent/examples/scripts/1/test_coedit_gec_evaluation.json +0 -30
  360. wisent/examples/scripts/1/test_coedit_gec_pairs.json +0 -8
  361. wisent/examples/scripts/1/test_cola_evaluation.json +0 -30
  362. wisent/examples/scripts/1/test_cola_pairs.json +0 -8
  363. wisent/examples/scripts/1/test_coqcat_evaluation.json +0 -30
  364. wisent/examples/scripts/1/test_coqcat_pairs.json +0 -8
  365. wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +0 -30
  366. wisent/examples/scripts/1/test_dbpedia_14_pairs.json +0 -8
  367. wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +0 -30
  368. wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +0 -8
  369. wisent/examples/scripts/1/test_ethos_binary_evaluation.json +0 -30
  370. wisent/examples/scripts/1/test_ethos_binary_pairs.json +0 -8
  371. wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +0 -30
  372. wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +0 -8
  373. wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +0 -30
  374. wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +0 -8
  375. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  376. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +0 -8
  377. wisent/examples/scripts/2/test_arc_ar_evaluation.json +0 -30
  378. wisent/examples/scripts/2/test_arc_ar_pairs.json +0 -8
  379. wisent/examples/scripts/2/test_atis_evaluation.json +0 -30
  380. wisent/examples/scripts/2/test_atis_pairs.json +0 -8
  381. wisent/examples/scripts/2/test_babi_evaluation.json +0 -30
  382. wisent/examples/scripts/2/test_babi_pairs.json +0 -8
  383. wisent/examples/scripts/2/test_babilong_evaluation.json +0 -30
  384. wisent/examples/scripts/2/test_babilong_pairs.json +0 -8
  385. wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +0 -30
  386. wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +0 -8
  387. wisent/examples/scripts/2/test_basque-glue_pairs.json +0 -14
  388. {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/WHEEL +0 -0
  389. {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/entry_points.txt +0 -0
  390. {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/licenses/LICENSE +0 -0
  391. {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/top_level.txt +0 -0
@@ -112,12 +112,10 @@ class CatcolaExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "catcola"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
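
The same two-line change repeats across most extractors in this release: the prompt no longer inlines both answer options as an A/B scaffold, and _build_pair receives the bare question while the contrast is carried entirely by the paired correct/incorrect responses. A minimal runnable sketch of the before/after shape, with hypothetical choices and question text standing in for a real benchmark doc:

# Hypothetical inputs standing in for a benchmark doc; the real extractors
# read these from lm-eval task documents.
choices = ["is acceptable", "is not acceptable"]
answer_idx = 0
question = "The cat sat on the mat."

correct = str(choices[answer_idx]).strip()
incorrect = str(choices[(answer_idx + 1) % len(choices)]).strip()

# Old prompt: both options were inlined as an A/B scaffold.
old_prompt = f"Question: {question}\nA. {incorrect}\nB. {correct}"

# New prompt: the bare question; the A/B contrast now lives only in the
# paired correct/incorrect responses handed to _build_pair.
new_prompt = question

print(old_prompt)
print(new_prompt)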
@@ -88,14 +88,14 @@ class CBExtractor(LMEvalBenchmarkExtractor):
         correct = labels[label]
         incorrect = labels[(label+1)%3]
 
-        formatted_question = f"{premise}\nQuestion: {hypothesis}.\nA. {incorrect}\nB. {correct}"
+        prompt = f"{premise}\nQuestion: {hypothesis}."
 
         metadata = {
             "label": "cb",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -114,4 +114,11 @@ class CBExtractor(LMEvalBenchmarkExtractor):
     ) -> ContrastivePair:
         positive_response = PositiveResponse(model_response=correct)
         negative_response = NegativeResponse(model_response=incorrect)
-        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
+
+    @staticmethod
+    def extract_choices_and_answer(task, doc: dict[str, Any]) -> tuple[list[str], str]:
+        choices = task.doc_to_choice(doc)
+        target_idx = task.doc_to_target(doc)
+        expected = choices[target_idx]
+        return choices, expected
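
The new extract_choices_and_answer helper assumes an lm-eval task whose doc_to_target returns an integer index into doc_to_choice, as multiple-choice tasks do. A self-contained sketch of how it resolves the expected answer; the FakeTask below is a hypothetical stand-in, not part of the package:

from typing import Any


class FakeTask:
    """Hypothetical stand-in for an lm-eval task object."""

    def doc_to_choice(self, doc: dict[str, Any]) -> list[str]:
        return ["True", "False", "Neither"]

    def doc_to_target(self, doc: dict[str, Any]) -> int:
        return doc["label"]


def extract_choices_and_answer(task, doc: dict[str, Any]) -> tuple[list[str], str]:
    # Mirrors the static method added above: the target index selects the choice.
    choices = task.doc_to_choice(doc)
    target_idx = task.doc_to_target(doc)
    return choices, choices[target_idx]


choices, expected = extract_choices_and_answer(FakeTask(), {"label": 2})
print(choices, expected)  # ['True', 'False', 'Neither'] Neither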
@@ -194,14 +194,12 @@ class CevalExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "ceval",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -136,12 +136,10 @@ class CevalValidExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "ceval_valid"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -83,12 +83,10 @@ class ChainExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "chain"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -153,14 +153,12 @@ class ChartqaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "chartqa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -124,12 +124,10 @@ class ClaimExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "claim"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class ClickExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "click",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -137,14 +137,12 @@ class CmmluExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "cmmlu",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -117,12 +117,10 @@ class CnnExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "cnn"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -121,12 +121,10 @@ class CocoterosExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "cocoteros"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -122,12 +122,10 @@ class CoeditExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "coedit"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class CommonsenseExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "commonsense"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -98,14 +98,12 @@ class CommonsenseQAExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "commonsense_qa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -92,7 +92,7 @@ class COPAExtractor(LMEvalBenchmarkExtractor):
         fills = {"cause": "because", "effect": "therefore"}
 
         question = f"{premise.rstrip('.')} {fills[question]}"
-        formatted_question = f"{question}\nA. {choice1}\nB. {choice2}"
+        prompt = f"{question}"
 
         correct = choice1 if label == 0 else choice2
         incorrect = choice2 if label == 0 else choice1
@@ -102,7 +102,7 @@ class COPAExtractor(LMEvalBenchmarkExtractor):
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
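
For COPA, the prompt is now just the premise joined to a connective chosen by the item's question type; the two alternatives are no longer inlined. A runnable sketch of that construction, with invented doc values (premise, choices, and label are hypothetical examples, not taken from the dataset):

# Hypothetical COPA-style doc values; real ones come from the benchmark.
premise = "The man broke his toe."
question = "cause"   # each COPA item is tagged "cause" or "effect"
choice1 = "He got a hole in his sock."
choice2 = "He dropped a hammer on his foot."
label = 1

fills = {"cause": "because", "effect": "therefore"}
prompt = f"{premise.rstrip('.')} {fills[question]}"   # no inlined A/B options

correct = choice1 if label == 0 else choice2
incorrect = choice2 if label == 0 else choice1
print(prompt)    # The man broke his toe because
print(correct)   # He dropped a hammer on his foot.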
@@ -140,14 +140,12 @@ class CopalIdExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "copal_id",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -95,7 +95,8 @@ class CoQAExtractor(LMEvalBenchmarkExtractor):
         if qs:
             lines.append(f"Q: {qs[-1]}")
 
-        formatted_question = "\n".join(lines)
+        prompt = "\n".join(lines)
+        prompt = f"{prompt}\nA:"
 
         correct = asw[-1] if len(asw) == len(qs) else "no"
         incorrect = None
@@ -133,14 +134,12 @@ class CoQAExtractor(LMEvalBenchmarkExtractor):
             # Generic fallback: negate or add "not "
             incorrect = f"not {correct}"
 
-        formatted_question = f"{formatted_question}\nA:\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "coqa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
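
For CoQA, the prompt keeps the joined dialogue history but now ends with a bare "A:" answer cue instead of inlined options, and the generic fallback still derives the negative response by prefixing "not". A sketch of the resulting prompt shape; the story and turns are invented, and how lines is assembled upstream (story, then alternating Q/A turns, then the final question) is an assumption:

# Invented story and turns; the upstream assembly of `lines` is assumed.
story = "Anna adopted a kitten from the shelter."
qs = ["Who adopted a kitten?", "Where was it from?"]
asw = ["Anna"]

lines = [story]
for q, a in zip(qs, asw):
    lines.append(f"Q: {q}")
    lines.append(f"A: {a}")
if qs:
    lines.append(f"Q: {qs[-1]}")

prompt = "\n".join(lines)
prompt = f"{prompt}\nA:"          # bare answer cue replaces inlined options

correct = asw[-1] if len(asw) == len(qs) else "no"
incorrect = f"not {correct}"      # generic fallback from the hunk above
print(prompt)
print(correct, "/", incorrect)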
@@ -123,14 +123,12 @@ class CsatqaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "csatqa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class CycleExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "cycle"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -192,14 +192,12 @@ class DarijaBenchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "darija_bench",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -121,12 +121,10 @@ class DarijahellaswagExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "darijahellaswag"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -145,14 +143,12 @@ class DarijahellaswagExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "darijahellaswag",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class DarijammluExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "darijammlu",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -130,12 +130,10 @@ class DbpediaExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "dbpedia"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class DiscrimEvalExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "discrim_eval",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class DocExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "doc"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -100,14 +100,14 @@ class DropExtractor(LMEvalBenchmarkExtractor):
         if correct == incorrect:
             incorrect += "k"
 
-        formatted_question = f"{passage} {question}\nA. {incorrect}\nB. {correct}"
+        prompt = f"{passage} {question}"
 
         metadata = {
             "label": "drop",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
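
For DROP, the prompt is now the passage concatenated with the question, and the pre-existing tie-break (appending "k") still keeps the negative response distinct when a perturbed answer collides with the correct one. A sketch with invented passage, question, and answer values:

# Invented values; the collision below deliberately forces the tie-break path.
passage = "The Bears scored 14 points in the first quarter and 7 in the second."
question = "How many points did the Bears score in the first quarter?"
correct = "14"
incorrect = "14"                  # a perturbed answer that happened to collide

if correct == incorrect:
    incorrect += "k"              # tie-break keeps the pair distinct

prompt = f"{passage} {question}"  # passage + question, no inlined options
print(prompt)
print(correct, "vs", incorrect)   # 14 vs 14k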
@@ -115,12 +115,10 @@ class EpecExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "epec"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class EqExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "eq"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -165,14 +165,12 @@ class EqBenchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "eq_bench",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class EqBenchCaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "eq-bench_ca",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class EqBenchEsExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "eq-bench_es",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class EsbbqExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "esbbq",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -108,12 +108,10 @@ class EthicsExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "ethics"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class EusExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "eus"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -196,14 +196,12 @@ class EusExamsExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "eus_exams",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -130,14 +130,12 @@ class EusProficiencyExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "eus_proficiency",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -130,14 +130,12 @@ class EusReadingExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "eus_reading",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,