wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/activation_cache.py +393 -0
  3. wisent/core/activations/activations.py +3 -3
  4. wisent/core/activations/activations_collector.py +9 -5
  5. wisent/core/activations/classifier_inference_strategy.py +12 -11
  6. wisent/core/activations/extraction_strategy.py +256 -84
  7. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  8. wisent/core/cli/__init__.py +2 -1
  9. wisent/core/cli/agent/apply_steering.py +5 -7
  10. wisent/core/cli/agent/train_classifier.py +19 -7
  11. wisent/core/cli/check_linearity.py +35 -3
  12. wisent/core/cli/cluster_benchmarks.py +4 -6
  13. wisent/core/cli/create_steering_vector.py +6 -4
  14. wisent/core/cli/diagnose_vectors.py +7 -4
  15. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  16. wisent/core/cli/generate_pairs_from_task.py +9 -56
  17. wisent/core/cli/geometry_search.py +137 -0
  18. wisent/core/cli/get_activations.py +1 -1
  19. wisent/core/cli/method_optimizer.py +4 -3
  20. wisent/core/cli/modify_weights.py +3 -2
  21. wisent/core/cli/optimize_sample_size.py +1 -1
  22. wisent/core/cli/optimize_steering.py +14 -16
  23. wisent/core/cli/optimize_weights.py +2 -1
  24. wisent/core/cli/preview_pairs.py +203 -0
  25. wisent/core/cli/steering_method_trainer.py +3 -3
  26. wisent/core/cli/tasks.py +19 -76
  27. wisent/core/cli/train_unified_goodness.py +3 -3
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  30. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  36. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  37. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  38. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  273. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  274. wisent/core/geometry_runner.py +995 -0
  275. wisent/core/geometry_search_space.py +237 -0
  276. wisent/core/hyperparameter_optimizer.py +1 -1
  277. wisent/core/main.py +3 -0
  278. wisent/core/models/core/atoms.py +5 -3
  279. wisent/core/models/wisent_model.py +1 -1
  280. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  281. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  282. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  283. wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
  284. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  285. wisent/core/parser_arguments/main_parser.py +8 -0
  286. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  287. wisent/core/steering.py +5 -3
  288. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  289. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  290. wisent/core/trainers/steering_trainer.py +2 -2
  291. wisent/core/utils/device.py +27 -27
  292. wisent/core/utils/layer_combinations.py +70 -0
  293. wisent/examples/__init__.py +1 -0
  294. wisent/examples/scripts/__init__.py +1 -0
  295. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  296. wisent/examples/scripts/discover_directions.py +469 -0
  297. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  298. wisent/examples/scripts/generate_paper_data.py +384 -0
  299. wisent/examples/scripts/intervention_validation.py +626 -0
  300. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  301. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  302. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  303. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  304. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  305. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  306. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  307. wisent/examples/scripts/search_all_short_names.py +31 -0
  308. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  309. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  310. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  311. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  312. wisent/examples/scripts/test_one_benchmark.py +324 -0
  313. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  314. wisent/examples/scripts/threshold_analysis.py +434 -0
  315. wisent/examples/scripts/visualization_gallery.py +582 -0
  316. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  317. wisent/parameters/lm_eval/category_directions.json +137 -0
  318. wisent/parameters/lm_eval/repair_plan.json +282 -0
  319. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  320. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  321. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  322. wisent/tests/test_detector_accuracy.py +1 -1
  323. wisent/tests/visualize_geometry.py +1 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  325. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
  326. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  327. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  328. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  329. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  330. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -87,12 +87,10 @@ class FlanExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "flan"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
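The same change repeats across most of the extractor hunks that follow: the A/B multiple-choice scaffolding is dropped from the prompt, and the bare question is passed to `_build_pair` alongside the full answer texts. A minimal sketch of the before/after pair shape, using a hypothetical dict-building `build_pair` stand-in (the real extractors call `self._build_pair`, whose internals are not shown in this diff), with made-up example inputs:

def build_pair(question: str, correct: str, incorrect: str, label: str) -> dict:
    # Hypothetical stand-in for LMEvalBenchmarkExtractor._build_pair, for illustration only.
    return {"question": question, "correct": correct, "incorrect": incorrect, "metadata": {"label": label}}

question = "What is the capital of France?"  # example inputs, not from any dataset
correct, incorrect = "Paris", "Lyon"

# 0.7.701: choices were baked into the prompt and duplicated in the responses.
old_pair = build_pair(f"Question: {question}\nA. {incorrect}\nB. {correct}", correct, incorrect, "flan")

# 0.7.901: the raw question is the prompt; the responses carry the contrast.
new_pair = build_pair(question, correct, incorrect, "flan")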
@@ -173,14 +173,12 @@ class FrenchBenchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "french_bench",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -126,14 +126,12 @@ class GalicianBenchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "galician_bench",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -109,12 +109,12 @@ class GaokaoExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {query}\nA. {incorrect}\nB. {correct}"
+        prompt = f"Question: {query}"
 
         metadata = {"label": "gaokao"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -91,12 +91,10 @@ class GlianorexExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "glianorex"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -142,14 +142,12 @@ class GlobalMmluExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "global_mmlu",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class GlobalPiqaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "global_piqa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -83,12 +83,10 @@ class Gpt3Extractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "gpt3"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -155,14 +155,12 @@ class GroundcocoaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "groundcocoa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class HaeraeExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "haerae",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -83,14 +83,14 @@ class HeadQAExtractor(LMEvalBenchmarkExtractor):
         correct = answers[answer_idx]
         incorrect = answers[(answer_idx+1)%len(answers)]
 
-        formatted_question = f"Question: {qtext}\nAnswer:\nA. {incorrect}\nB. {correct}"
+        prompt = f"Question: {qtext}\nAnswer:"
 
         metadata = {
             "label": "headqa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -96,14 +96,14 @@ class HellaSwagExtractor(LMEvalBenchmarkExtractor):
         incorrect = max(incorrect_endings, key=len) if incorrect_endings else endings[(label+1)%len(endings)]
 
         question = f"{query}"
-        formatted_question = f"{question}\nA. {incorrect}\nB. {correct}"
+        prompt = f"{question}"
 
         metadata = {
             "label": "hellaswag",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -122,12 +122,10 @@ class HendrycksEthicsExtractor(LMEvalBenchmarkExtractor):
         if not activity or not baseline:
             return None
 
-        question = f"Which action results in greater overall happiness?\nA. {activity}\nB. {baseline}"
-
-        # For utilitarianism, we need to compare - just use A as correct for now
-        # (the actual rating field is empty in the data)
-        correct = "A"
-        incorrect = "B"
+        # Raw prompt - activity is correct, baseline is incorrect
+        question = "Which action results in greater overall happiness?"
+        correct = activity
+        incorrect = baseline
 
         metadata = {"label": "hendrycks_ethics"}
 
@@ -196,14 +194,12 @@ class HendrycksEthicsExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "hendrycks_ethics",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -138,39 +138,86 @@ class HendrycksMathExtractor(LMEvalBenchmarkExtractor):
 
         return None
 
-    def _create_incorrect_answer(self, correct: str) -> str:
+    def _create_incorrect_answer(self, correct: str, doc: dict = None) -> str:
         """
-        Create an incorrect answer by modifying the correct one.
+        Create a meaningful incorrect answer by using different plausible wrong values.
+
+        Strategy:
+        1. For integers: use a different integer (multiply by 2, subtract, etc.)
+        2. For fractions: change numerator/denominator in a plausible way
+        3. For expressions: provide a structurally different but plausible answer
 
         Args:
             correct: The correct answer
+            doc: Optional doc for context
 
         Returns:
-            An incorrect answer
+            A plausible but incorrect answer
         """
-        # Try to parse as number and modify it
+        import random
+        random.seed(hash(correct) % (2**32))  # Deterministic based on answer
+
+        # Try to parse as number and create plausible wrong answer
         try:
-            # Remove common LaTeX/math formatting
             clean = correct.replace('$', '').replace(',', '').replace('^\\circ', '').replace('^{\\circ}', '').strip()
 
             # Try integer
             num = int(clean)
-            return str(num + 1)
+            # Use various wrong transformations
+            wrong_transforms = [
+                num * 2,  # doubled
+                num // 2 if num > 1 else num * 3,  # halved or tripled
+                num - 1 if num > 0 else num + 2,  # off by different amount
+                num + 10,  # significantly different
+                abs(num) * -1 if num > 0 else abs(num),  # sign flip
+            ]
+            return str(random.choice(wrong_transforms))
         except ValueError:
             try:
                 # Try float
                 num = float(clean)
-                return str(num + 1.0)
+                wrong_transforms = [
+                    num * 2,
+                    num / 2,
+                    num - 0.5,
+                    num + 0.25,
+                    round(num) if num != round(num) else num + 0.5,
+                ]
+                return str(random.choice(wrong_transforms))
             except ValueError:
-                # Can't parse as number, create a modified version
-                # For fractions like \frac{8}{17}, modify numerator
-                frac_match = re.match(r'\\frac\{(\d+)\}\{(\d+)\}', correct)
-                if frac_match:
-                    num, denom = frac_match.groups()
-                    return f"\\frac{{{int(num) + 1}}}{{{denom}}}"
-
-                # For other cases, just append " + 1"
-                return f"{correct} + 1"
+                pass
+
+        # For fractions like \frac{8}{17}, create plausible wrong fraction
+        frac_match = re.match(r'\\frac\{(\d+)\}\{(\d+)\}', correct)
+        if frac_match:
+            num, denom = int(frac_match.group(1)), int(frac_match.group(2))
+            wrong_fracs = [
+                f"\\frac{{{denom}}}{{{num}}}",  # inverted
+                f"\\frac{{{num}}}{{{denom + 1}}}",  # different denominator
+                f"\\frac{{{num * 2}}}{{{denom}}}",  # doubled numerator
+            ]
+            return random.choice(wrong_fracs)
+
+        # For sqrt expressions
+        sqrt_match = re.search(r'\\sqrt\{(\d+)\}', correct)
+        if sqrt_match:
+            val = int(sqrt_match.group(1))
+            wrong_vals = [val + 1, val - 1 if val > 1 else val + 2, val * 2]
+            return correct.replace(f"\\sqrt{{{val}}}", f"\\sqrt{{{random.choice(wrong_vals)}}}")
+
+        # For pi expressions
+        if '\\pi' in correct:
+            if '2\\pi' in correct:
+                return correct.replace('2\\pi', '\\pi')
+            elif '\\pi' in correct:
+                return correct.replace('\\pi', '2\\pi')
+
+        # For other symbolic answers, provide common wrong alternatives
+        common_wrong = ['0', '1', '-1', '2', '\\infty', 'undefined']
+        if correct not in common_wrong:
+            return random.choice([w for w in common_wrong if w != correct])
+
+        return "incorrect"
 
     @staticmethod
     def _build_pair(
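The rewritten `_create_incorrect_answer` above replaces the old off-by-one fallback (`num + 1`, or appending " + 1") with answer-dependent transforms. Below is a self-contained sketch of just the integer path; the helper name `incorrect_int_answer` is illustrative and not part of the package, and note that Python salts `str` hashes per process, so the seeded choice is stable within a run but not across runs unless PYTHONHASHSEED is pinned:

import random

def incorrect_int_answer(correct: str) -> str:
    # Seed on the answer text so the same input yields the same distractor within a process.
    random.seed(hash(correct) % (2**32))
    num = int(correct.replace('$', '').replace(',', '').strip())
    wrong_transforms = [
        num * 2,                                  # doubled
        num // 2 if num > 1 else num * 3,         # halved or tripled
        num - 1 if num > 0 else num + 2,          # off by a small amount
        num + 10,                                 # significantly different
        abs(num) * -1 if num > 0 else abs(num),   # sign flip
    ]
    return str(random.choice(wrong_transforms))

print(incorrect_int_answer("42"))  # e.g. "84" or "-42"; never the trivial "43" the old rule produced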
@@ -150,14 +150,12 @@ class HistoiresMoralesExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "histoires_morales",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -174,14 +174,12 @@ class Hrm8kExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "hrm8k",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class HumanevalInfillingExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "humaneval_infilling",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class IcelandicWinograndeExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "icelandic_winogrande",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class InverseExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "inverse"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -163,14 +163,12 @@ class InverseScalingExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "inverse_scaling",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class JaExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "ja"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -126,14 +126,12 @@ class JapaneseLeaderboardExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "japanese_leaderboard",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -103,7 +103,7 @@ class JapaneseLeaderboardMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
 
-        formatted_question = f"Question: {question}\\nA. {incorrect}\\nB. {correct}"
+        prompt = f"Question: {question}"
 
         positive_response = PositiveResponse(model_response=correct)
         negative_response = NegativeResponse(model_response=incorrect)
@@ -139,14 +139,12 @@ class KmmluExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "kmmlu",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -136,14 +136,12 @@ class KobestExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "kobest",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -118,29 +118,17 @@ class KormedmcqaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        # Format question with all choices
-        formatted_question = (
-            f"{question}\n"
-            f"A. {choices[0]}\n"
-            f"B. {choices[1]}\n"
-            f"C. {choices[2]}\n"
-            f"D. {choices[3]}\n"
-            f"E. {choices[4]}\n"
-            f"정답:"
-        )
+        # Raw prompt without MC formatting
+        prompt = question
 
         metadata = {
             "label": "kormedmcqa",
         }
 
-        # The correct answer is the letter (A-E)
-        correct_letter = chr(ord('A') + answer_idx)
-        incorrect_letter = chr(ord('A') + incorrect_idx)
-
         return self._build_pair(
-            question=formatted_question,
-            correct=correct_letter,
-            incorrect=incorrect_letter,
+            question=prompt,
+            correct=correct,
+            incorrect=incorrect,
             metadata=metadata,
         )
 
@@ -156,14 +156,12 @@ class LambadaClozeExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "lambada_cloze",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -156,14 +156,12 @@ class LambadaMultilingualExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "lambada_multilingual",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class LawExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "law"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -165,14 +165,12 @@ class LeaderboardExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "leaderboard",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -174,14 +174,12 @@ class LingolyExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "lingoly",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,