wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/activation_cache.py +393 -0
  3. wisent/core/activations/activations.py +3 -3
  4. wisent/core/activations/activations_collector.py +9 -5
  5. wisent/core/activations/classifier_inference_strategy.py +12 -11
  6. wisent/core/activations/extraction_strategy.py +256 -84
  7. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  8. wisent/core/cli/__init__.py +2 -1
  9. wisent/core/cli/agent/apply_steering.py +5 -7
  10. wisent/core/cli/agent/train_classifier.py +19 -7
  11. wisent/core/cli/check_linearity.py +35 -3
  12. wisent/core/cli/cluster_benchmarks.py +4 -6
  13. wisent/core/cli/create_steering_vector.py +6 -4
  14. wisent/core/cli/diagnose_vectors.py +7 -4
  15. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  16. wisent/core/cli/generate_pairs_from_task.py +9 -56
  17. wisent/core/cli/geometry_search.py +137 -0
  18. wisent/core/cli/get_activations.py +1 -1
  19. wisent/core/cli/method_optimizer.py +4 -3
  20. wisent/core/cli/modify_weights.py +3 -2
  21. wisent/core/cli/optimize_sample_size.py +1 -1
  22. wisent/core/cli/optimize_steering.py +14 -16
  23. wisent/core/cli/optimize_weights.py +2 -1
  24. wisent/core/cli/preview_pairs.py +203 -0
  25. wisent/core/cli/steering_method_trainer.py +3 -3
  26. wisent/core/cli/tasks.py +19 -76
  27. wisent/core/cli/train_unified_goodness.py +3 -3
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  30. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  36. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  37. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  38. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  273. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  274. wisent/core/geometry_runner.py +995 -0
  275. wisent/core/geometry_search_space.py +237 -0
  276. wisent/core/hyperparameter_optimizer.py +1 -1
  277. wisent/core/main.py +3 -0
  278. wisent/core/models/core/atoms.py +5 -3
  279. wisent/core/models/wisent_model.py +1 -1
  280. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  281. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  282. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  283. wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
  284. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  285. wisent/core/parser_arguments/main_parser.py +8 -0
  286. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  287. wisent/core/steering.py +5 -3
  288. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  289. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  290. wisent/core/trainers/steering_trainer.py +2 -2
  291. wisent/core/utils/device.py +27 -27
  292. wisent/core/utils/layer_combinations.py +70 -0
  293. wisent/examples/__init__.py +1 -0
  294. wisent/examples/scripts/__init__.py +1 -0
  295. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  296. wisent/examples/scripts/discover_directions.py +469 -0
  297. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  298. wisent/examples/scripts/generate_paper_data.py +384 -0
  299. wisent/examples/scripts/intervention_validation.py +626 -0
  300. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  301. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  302. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  303. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  304. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  305. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  306. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  307. wisent/examples/scripts/search_all_short_names.py +31 -0
  308. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  309. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  310. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  311. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  312. wisent/examples/scripts/test_one_benchmark.py +324 -0
  313. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  314. wisent/examples/scripts/threshold_analysis.py +434 -0
  315. wisent/examples/scripts/visualization_gallery.py +582 -0
  316. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  317. wisent/parameters/lm_eval/category_directions.json +137 -0
  318. wisent/parameters/lm_eval/repair_plan.json +282 -0
  319. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  320. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  321. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  322. wisent/tests/test_detector_accuracy.py +1 -1
  323. wisent/tests/visualize_geometry.py +1 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  325. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
  326. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  327. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  328. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  329. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  330. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
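Most of the extractor hunks below make the same change: the pre-formatted multiple-choice prompt (with the A./B. options inlined into the question text) is dropped, and the raw question or prompt is passed to _build_pair, so the options live only in the correct/incorrect fields. A minimal sketch of that before/after pattern, using a stand-in build_pair helper rather than the real LMEvalBenchmarkExtractor._build_pair (whose signature is only known from the keyword arguments visible in the diff):

def build_pair(question: str, correct: str, incorrect: str, metadata: dict) -> dict:
    # Stand-in for the extractor base-class helper; returns a contrastive pair record.
    return {"question": question, "correct": correct, "incorrect": incorrect, "metadata": metadata}

question, correct, incorrect = "What is the boiling point of water at sea level?", "100 C", "50 C"

# 0.7.701 style: answer options baked into the prompt text
formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
old_pair = build_pair(formatted_question, correct, incorrect, {"label": "example"})

# 0.7.901 style: the raw question is passed through unchanged
new_pair = build_pair(question, correct, incorrect, {"label": "example"})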
@@ -117,10 +117,9 @@ class OkapiTruthfulqaMultilingualExtractor(LMEvalBenchmarkExtractor):
  incorrect = choices[incorrect_idx].strip() if isinstance(choices[incorrect_idx], str) else str(choices[incorrect_idx])

  if correct and incorrect:
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "okapi/truthfulqa_multilingual"}
  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -180,14 +179,12 @@ class OkapiTruthfulqaMultilingualExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "okapi/truthfulqa_multilingual",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -157,14 +157,12 @@ class OlaphExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "olaph",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -89,14 +89,14 @@ class OpenBookQAExtractor(LMEvalBenchmarkExtractor):
  incorrect = endings[(answer_idx+1)%len(endings)]

  question = f"{question_stem}"
- formatted_question = f"{question}\nA. {incorrect}\nB. {correct}"
+ prompt = f"{question}"

  metadata = {
  "label": "openbookqa",
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -80,12 +80,10 @@ class OptionExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "option"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -83,12 +83,10 @@ class ParafrasejaExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "parafraseja"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -83,12 +83,10 @@ class ParafrasesExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "parafrases"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -80,12 +80,10 @@ class PawsExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "paws"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -125,14 +125,12 @@ class PawsXExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "paws-x",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -82,7 +82,7 @@ class PawsXExtractor(LMEvalBenchmarkExtractor):
  )
  return None

- formatted_question = f"Is sentence '{sentence1}' paraphrase of sentence '{sentence2}'?\nA. Yes\nB. No"
+ prompt = f"Is sentence '{sentence1}' paraphrase of sentence '{sentence2}'?"

  # label == 1 means paraphrase (positive), label == 0 means not paraphrase (negative)
  correct = "Yes" if label == 1 else "No"
@@ -93,7 +93,7 @@
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -219,12 +219,10 @@ class PersonaExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "persona"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -117,12 +117,10 @@ class PhrasesExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "phrases"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -132,14 +132,12 @@ class PileExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "pile",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -84,7 +84,7 @@ class PIQAExtractor(LMEvalBenchmarkExtractor):
  return None

  question = f"Question: {goal}\nAnswer:"
- formatted_question = f"{question}\nA. {sol1}\nB. {sol2}"
+ prompt = f"{question}"

  correct = sol1 if label == 0 else sol2
  incorrect = sol2 if label == 0 else sol1
@@ -94,7 +94,7 @@
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -126,14 +126,12 @@ class PortugueseBenchExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "portuguese_bench",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -80,12 +80,10 @@ class PromptExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "prompt"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -86,14 +86,14 @@ class ProstExtractor(LMEvalBenchmarkExtractor):
  correct = answers[label]
  incorrect = answers[(label+1)%len(answers)]

- formatted_question = f"{context}\nQuestion: {question}\nAnswer:\nA. {incorrect}\nB. {correct}"
+ prompt = f"{context}\nQuestion: {question}\nAnswer:"

  metadata = {
  "label": "prost",
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -80,7 +80,7 @@ class PubMedQAExtractor(LMEvalBenchmarkExtractor):
  return None

  formatted_context = " ".join(s.strip() for s in contexts if isinstance(s, str) and s.strip())
- formatted_question = f"Abstract: {formatted_context}\nQuestion: {question}\nAnswer:\nA. yes\nB. no"
+ prompt = f"Abstract: {formatted_context}\nQuestion: {question}"

  correct = final_decision
  incorrect = "yes" if correct == "no" else "no"
@@ -90,7 +90,7 @@
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -90,14 +90,14 @@ class QA4MREExtractor(LMEvalBenchmarkExtractor):
  correct = answers[answer]
  incorrect = answers[(answer+1)%len(answers)]

- formatted_question = f"{document_str}\nQuestion: {question_str}?\nAnswer:\nA. {incorrect}\nB. {correct}"
+ prompt = f"{document_str}\nQuestion: {question_str}?\nAnswer:"

  metadata = {
  "label": "qa4mre",
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -86,7 +86,7 @@ class QasperExtractor(LMEvalBenchmarkExtractor):
  return None


- formatted_question = f"TITLE: {title}\nABSTRACT: {abstract}\nQ: {question}\nA. yes\nB. no"
+ prompt = f"TITLE: {title}\nABSTRACT: {abstract}\nQ: {question}"

  correct = answer
  incorrect = "yes" if answer == "no" else "no"
@@ -96,7 +96,7 @@
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -80,7 +80,7 @@ class QasperBoolExtractor(LMEvalBenchmarkExtractor):
  return None


- formatted_question = f"TITLE: {title}\nABSTRACT: {abstract}\nQ: {question}\nA. yes\nB. no"
+ prompt = f"TITLE: {title}\nABSTRACT: {abstract}\nQ: {question}"

  correct = answer
  incorrect = "yes" if answer == "no" else "no"
@@ -90,7 +90,7 @@
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -79,7 +79,7 @@ class QNLIExtractor(LMEvalBenchmarkExtractor):
  )
  return None

- formatted_question = f"{question}\n{sentence}\nQuestion: Does this response answer the question?\nAnswer:\nA. Yes\nB. No"
+ prompt = f"{question}\n{sentence}\nQuestion: Does this response answer the question?"

  correct = "Yes" if label == 0 else "No"
  incorrect = "No" if label == 0 else "Yes"
@@ -89,7 +89,7 @@
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -80,12 +80,10 @@ class QnlieuExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "qnlieu"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -79,7 +79,7 @@ class QQPExtractor(LMEvalBenchmarkExtractor):
  )
  return None

- formatted_question = f"{question1}\n{question2}\nQuestion: Do both qiestions ask the same thing?\nAnswer:\nA. Yes\nB. No"
+ prompt = f"{question1}\n{question2}\nQuestion: Do both questions ask the same thing?"

  correct = "Yes" if label == 1 else "No"
  incorrect = "No" if label == 1 else "Yes"
@@ -89,7 +89,7 @@
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -95,14 +95,14 @@ class RACEExtractor(LMEvalBenchmarkExtractor):
  correct = options[answer_idx]
  incorrect = options[(answer_idx+1)%len(options)]

- formatted_question = f"{article}\nQuestion: {question}?\nAnswer:\nA. {incorrect}\nB. {correct}"
+ prompt = f"{article}\nQuestion: {question}?\nAnswer:"

  metadata = {
  "label": "race",
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -80,12 +80,10 @@ class RandomExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "random"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -96,14 +96,14 @@ class RecordExtractor(LMEvalBenchmarkExtractor):
  # Remove @highlight prefix
  passage = passage.replace('@highlight', '')

- formatted_question = f"Passage: {passage}\n\nQuery: {query}\nWhich option correctly completes the sentence at @placeholder?\nA. {incorrect}\nB. {correct}"
+ prompt = f"Passage: {passage}\n\nQuery: {query}\nWhich option correctly completes the sentence at @placeholder?"

  metadata = {
  "label": "record",
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -83,12 +83,10 @@ class ReversedExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "reversed"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -79,7 +79,7 @@ class RTEExtractor(LMEvalBenchmarkExtractor):
  )
  return None

- formatted_question = f"{sentence1}\nQuestion: {sentence2} True or False?\nAnswer:\nA. True\nB. False"
+ prompt = f"{sentence1}\nQuestion: {sentence2} True or False?"

  correct = "True" if label == 0 else "False"
  incorrect = "False" if label == 0 else "True"
@@ -89,7 +89,7 @@
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -141,14 +141,12 @@ class RulerExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "ruler",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -84,14 +84,14 @@ class SciQExtractor(LMEvalBenchmarkExtractor):
  )
  return None

- formatted_question = f"{support}\nQuestion: {question}\nAnswer:\nA. {incorrect}\nB. {correct}"
+ prompt = f"{support}\nQuestion: {question}\nAnswer:"

  metadata = {
  "label": "sciq",
  }

  return self._build_pair(
- question=formatted_question,
+ question=prompt,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -150,12 +150,10 @@ class ScoreExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(cleaned_choices)
  incorrect = cleaned_choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {"label": "score_robustness"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -132,14 +132,12 @@ class ScrollsExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "scrolls",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -128,14 +128,12 @@ class ScrollsMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = choices[incorrect_idx]

- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
  metadata = {
  "label": "scrolls_mc",
  }

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -83,12 +83,10 @@ class SelfExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "self"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -104,12 +104,10 @@ class SglueExtractor(LMEvalBenchmarkExtractor):
  correct = str(choices[answer_idx]).strip()
  incorrect_idx = (answer_idx + 1) % len(choices)
  incorrect = str(choices[incorrect_idx]).strip()
-
- formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
  metadata = {"label": "sglue"}

  return self._build_pair(
- question=formatted_question,
+ question=question,
  correct=correct,
  incorrect=incorrect,
  metadata=metadata,
@@ -92,7 +92,8 @@ class SglueRteExtractor(LMEvalBenchmarkExtractor):
  correct = "False"
  incorrect = "True"

- prompt = f"Premise: {premise}\nHypothesis: {hypothesis} True or False?\nAnswer:\nA. {incorrect}\nB. {correct}"
+ # Raw prompt without A./B. formatting
+ prompt = f"Premise: {premise}\nHypothesis: {hypothesis} True or False?"

  metadata = {"label": "sglue_rte"}