wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/activation_cache.py +393 -0
  3. wisent/core/activations/activations.py +3 -3
  4. wisent/core/activations/activations_collector.py +9 -5
  5. wisent/core/activations/classifier_inference_strategy.py +12 -11
  6. wisent/core/activations/extraction_strategy.py +256 -84
  7. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  8. wisent/core/cli/__init__.py +2 -1
  9. wisent/core/cli/agent/apply_steering.py +5 -7
  10. wisent/core/cli/agent/train_classifier.py +19 -7
  11. wisent/core/cli/check_linearity.py +35 -3
  12. wisent/core/cli/cluster_benchmarks.py +4 -6
  13. wisent/core/cli/create_steering_vector.py +6 -4
  14. wisent/core/cli/diagnose_vectors.py +7 -4
  15. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  16. wisent/core/cli/generate_pairs_from_task.py +9 -56
  17. wisent/core/cli/geometry_search.py +137 -0
  18. wisent/core/cli/get_activations.py +1 -1
  19. wisent/core/cli/method_optimizer.py +4 -3
  20. wisent/core/cli/modify_weights.py +3 -2
  21. wisent/core/cli/optimize_sample_size.py +1 -1
  22. wisent/core/cli/optimize_steering.py +14 -16
  23. wisent/core/cli/optimize_weights.py +2 -1
  24. wisent/core/cli/preview_pairs.py +203 -0
  25. wisent/core/cli/steering_method_trainer.py +3 -3
  26. wisent/core/cli/tasks.py +19 -76
  27. wisent/core/cli/train_unified_goodness.py +3 -3
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  30. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  36. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  37. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  38. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  273. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  274. wisent/core/geometry_runner.py +995 -0
  275. wisent/core/geometry_search_space.py +237 -0
  276. wisent/core/hyperparameter_optimizer.py +1 -1
  277. wisent/core/main.py +3 -0
  278. wisent/core/models/core/atoms.py +5 -3
  279. wisent/core/models/wisent_model.py +1 -1
  280. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  281. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  282. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  283. wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
  284. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  285. wisent/core/parser_arguments/main_parser.py +8 -0
  286. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  287. wisent/core/steering.py +5 -3
  288. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  289. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  290. wisent/core/trainers/steering_trainer.py +2 -2
  291. wisent/core/utils/device.py +27 -27
  292. wisent/core/utils/layer_combinations.py +70 -0
  293. wisent/examples/__init__.py +1 -0
  294. wisent/examples/scripts/__init__.py +1 -0
  295. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  296. wisent/examples/scripts/discover_directions.py +469 -0
  297. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  298. wisent/examples/scripts/generate_paper_data.py +384 -0
  299. wisent/examples/scripts/intervention_validation.py +626 -0
  300. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  301. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  302. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  303. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  304. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  305. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  306. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  307. wisent/examples/scripts/search_all_short_names.py +31 -0
  308. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  309. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  310. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  311. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  312. wisent/examples/scripts/test_one_benchmark.py +324 -0
  313. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  314. wisent/examples/scripts/threshold_analysis.py +434 -0
  315. wisent/examples/scripts/visualization_gallery.py +582 -0
  316. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  317. wisent/parameters/lm_eval/category_directions.json +137 -0
  318. wisent/parameters/lm_eval/repair_plan.json +282 -0
  319. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  320. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  321. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  322. wisent/tests/test_detector_accuracy.py +1 -1
  323. wisent/tests/visualize_geometry.py +1 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  325. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
  326. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  327. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  328. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  329. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  330. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -168,14 +168,12 @@ class XcopaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "xcopa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -120,12 +120,10 @@ class XlsumExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "xlsum"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -102,14 +102,14 @@ class XNLIExtractor(LMEvalBenchmarkExtractor):
         correct = labels[label]
         incorrect = labels[(label+1)%3]
 
-        formatted_question = f"Decide the relationship of the hypothesis '{hypothesis}' to the premise '{premise}\nA. {incorrect}\nB. {correct}"
+        prompt = f"Decide the relationship of the hypothesis '{hypothesis}' to the premise '{premise}"
 
         metadata = {
             "label": "xnli",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -110,7 +110,7 @@ class XquadExtractor(LMEvalBenchmarkExtractor):
 
         metadata = {"label": "xquad"}
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct_answer,
             incorrect=incorrect_answer,
             metadata=metadata,
@@ -174,14 +174,12 @@ class XquadExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "xquad",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -99,15 +99,14 @@ class XStoryClozeExtractor(LMEvalBenchmarkExtractor):
         correct = endings[answer]
         incorrect = endings[(answer+1)%len(endings)]
 
-        formatted_question = " ".join(s.strip() for s in inputs if s)
-        formatted_question = f"{formatted_question}\n \nA. {incorrect}\nB. {correct}"
+        prompt = " ".join(s.strip() for s in inputs if s)
 
         metadata = {
             "label": "xstorycloze",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -95,14 +95,14 @@ class XWinogradExtractor(LMEvalBenchmarkExtractor):
         correct = options[answer]
         incorrect = options[(answer+1)%len(options)]
 
-        formatted_question = f"Fill in the blank: {sentence}\nA. {incorrect}\nB. {correct}"
+        prompt = f"Fill in the blank: {sentence}"
 
         metadata = {
             "label": "xwinograd",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -126,14 +126,12 @@ class ZhoblimpExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "zhoblimp",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
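
The eight extractor hunks above make the same change: the A/B answer options are no longer baked into the question string handed to _build_pair; the raw question and the two completions stay separate. A minimal sketch of the before/after shape, using invented example data and a plain dict rather than wisent's actual pair objects:

    # Illustration only: shows how the prompt passed downstream changes.
    # The dict stands in for wisent's real ContrastivePair / _build_pair API.
    question = "The man broke his toe. What was the CAUSE of this?"
    correct = "He dropped a hammer on his foot."
    incorrect = "He got a hole in his sock."

    # Old behaviour: answer options were embedded in the question text.
    old_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"

    # New behaviour: the raw question is passed through and the completions
    # stay separate, leaving any A/B formatting to downstream code.
    new_pair = {"question": question, "correct": correct, "incorrect": incorrect}

    print(old_question)
    print(new_pair)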
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import random
 from typing import TYPE_CHECKING
 
 from wisent.core.contrastive_pairs.lm_eval_pairs.lm_extractor_registry import get_extractor
@@ -10,17 +11,178 @@ if TYPE_CHECKING:
     from lm_eval.api.task import ConfigurableTask
     from wisent.core.contrastive_pairs.core.pair import ContrastivePair
 
-__all__ = ["build_contrastive_pairs"]
+__all__ = ["build_contrastive_pairs", "lm_build_contrastive_pairs"]
 
 _LOG = setup_logger(__name__)
 
 
+def _flatten_task_dict(task_dict: dict, prefix: str = "") -> list[tuple[str, "ConfigurableTask"]]:
+    """
+    Recursively flatten nested group tasks into a list of (name, ConfigurableTask) tuples.
+
+    arguments:
+        task_dict: Dict of task_name -> ConfigurableTask or nested dict
+        prefix: Prefix for nested task names
+
+    returns:
+        List of (full_task_name, ConfigurableTask) tuples (leaf tasks only)
+    """
+    from lm_eval.api.task import ConfigurableTask
+
+    result = []
+    for name, task in task_dict.items():
+        full_name = f"{prefix}/{name}" if prefix else name
+        if isinstance(task, ConfigurableTask):
+            result.append((full_name, task))
+        elif isinstance(task, dict):
+            # Nested group - recurse
+            result.extend(_flatten_task_dict(task, full_name))
+    return result
+
+
+def _add_evaluator_to_pairs(
+    pairs: list["ContrastivePair"],
+    evaluator_name: str | None,
+    task_name: str,
+) -> list["ContrastivePair"]:
+    """Add evaluator_name and task_name to each pair's metadata."""
+    from dataclasses import replace
+
+    result = []
+    for pair in pairs:
+        metadata = dict(pair.metadata) if pair.metadata else {}
+        metadata["evaluator_name"] = evaluator_name
+        metadata["source_task"] = task_name
+        result.append(replace(pair, metadata=metadata))
+    return result
+
+
+def build_contrastive_pairs(
+    task_name: str,
+    limit: int | None = None,
+) -> list["ContrastivePair"]:
+    """
+    Unified loader for contrastive pairs - handles both HuggingFace and lm-eval tasks.
+
+    Automatically:
+    - Detects if task is HF or lm-eval
+    - Handles group tasks (including nested groups) by sampling from all subtasks
+    - Adds evaluator_name to each pair's metadata
+
+    arguments:
+        task_name:
+            Name of the benchmark/task (e.g., "winogrande", "mmlu", "humaneval").
+        limit:
+            Optional upper bound on the number of pairs to return.
+            Values <= 0 are treated as "no limit".
+
+    returns:
+        A list of ContrastivePair objects, each with metadata containing
+        'evaluator_name' and 'source_task'.
+    """
+    log = bind(_LOG, task=task_name or "unknown")
+    log.info("Building contrastive pairs (unified)", extra={"limit": limit})
+
+    # Normalize limit
+    max_items = None if (limit is None or limit <= 0) else int(limit)
+
+    # Get extractor
+    extractor = get_extractor(task_name)
+    log.info("Using extractor", extra={"extractor": extractor.__class__.__name__})
+
+    # Get evaluator_name from extractor
+    evaluator_name = getattr(extractor, 'evaluator_name', None)
+
+    # HuggingFace extractor - load directly
+    if isinstance(extractor, HuggingFaceBenchmarkExtractor):
+        log.info("HuggingFace task - loading directly")
+        pairs = extractor.extract_contrastive_pairs(limit=max_items)
+        return _add_evaluator_to_pairs(pairs, evaluator_name, task_name)
+
+    # lm-eval extractor - need to load task
+    log.info("lm-eval task - loading via LMEvalDataLoader")
+    from wisent.core.data_loaders.loaders.lm_loader import LMEvalDataLoader
+
+    loader = LMEvalDataLoader()
+    try:
+        task_obj = loader.load_lm_eval_task(task_name)
+    except Exception as e:
+        log.error(f"Failed to load lm-eval task: {e}")
+        raise
+
+    # Single task (ConfigurableTask)
+    from lm_eval.api.task import ConfigurableTask
+    if isinstance(task_obj, ConfigurableTask):
+        log.info("Single task")
+        pairs = extractor.extract_contrastive_pairs(task_obj, limit=max_items)
+        return _add_evaluator_to_pairs(pairs, evaluator_name, task_name)
+
+    # Group task (dict) - flatten and sample from all subtasks
+    if isinstance(task_obj, dict):
+        leaf_tasks = _flatten_task_dict(task_obj)
+        log.info(f"Group task with {len(leaf_tasks)} leaf subtasks")
+
+        if not leaf_tasks:
+            log.warning("No leaf tasks found in group")
+            return []
+
+        # Shuffle to get random sampling across subtasks
+        random.shuffle(leaf_tasks)
+
+        # Calculate pairs per subtask
+        if max_items is None:
+            pairs_per_task = None
+        else:
+            # Distribute limit across subtasks, minimum 1 per task
+            pairs_per_task = max(1, max_items // len(leaf_tasks))
+
+        all_pairs = []
+        for subtask_name, subtask in leaf_tasks:
+            try:
+                # Get the leaf task name (last part after /)
+                leaf_name = subtask_name.split("/")[-1] if "/" in subtask_name else subtask_name
+
+                # Try to get extractor for the specific subtask first
+                try:
+                    subtask_extractor = get_extractor(leaf_name)
+                except:
+                    # Fall back to parent extractor
+                    subtask_extractor = extractor
+
+                subtask_evaluator = getattr(subtask_extractor, 'evaluator_name', evaluator_name)
+
+                subtask_pairs = subtask_extractor.extract_contrastive_pairs(subtask, limit=pairs_per_task)
+                subtask_pairs = _add_evaluator_to_pairs(subtask_pairs, subtask_evaluator, subtask_name)
+                all_pairs.extend(subtask_pairs)
+
+                # Stop if we have enough
+                if max_items is not None and len(all_pairs) >= max_items:
+                    break
+            except Exception as e:
+                log.warning(f"Failed to extract from subtask {subtask_name}: {e}")
+                continue
+
+        # Shuffle final result and trim to limit
+        random.shuffle(all_pairs)
+        if max_items is not None:
+            all_pairs = all_pairs[:max_items]
+
+        log.info(f"Extracted {len(all_pairs)} pairs from group task")
+        return all_pairs
+
+    log.error(f"Unexpected task_obj type: {type(task_obj)}")
+    return []
+
+
 def lm_build_contrastive_pairs(
     task_name: str,
-    lm_eval_task: ConfigurableTask | None,
+    lm_eval_task: "ConfigurableTask | None",
     limit: int | None = None,
-) -> list[ContrastivePair]:
+) -> list["ContrastivePair"]:
     """
-    Resolve the task's extractor (lazy-loaded) and return contrastive pairs.
+    Legacy function - resolve the task's extractor and return contrastive pairs.
+
+    For new code, prefer using build_contrastive_pairs() which handles
+    task loading automatically.
 
     arguments:
         task_name:
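
For orientation, a toy sketch of what the new _flatten_task_dict helper produces for a nested lm-eval group dict. DummyTask here is a stand-in for lm_eval's ConfigurableTask, which the real helper detects with isinstance:

    # Sketch of the flattening behaviour; DummyTask replaces ConfigurableTask.
    class DummyTask:
        def __init__(self, name: str):
            self.name = name

    def flatten(task_dict: dict, prefix: str = "") -> list[tuple[str, object]]:
        result = []
        for name, task in task_dict.items():
            full_name = f"{prefix}/{name}" if prefix else name
            if isinstance(task, dict):          # nested group -> recurse
                result.extend(flatten(task, full_name))
            else:                               # leaf task -> keep (name, task)
                result.append((full_name, task))
        return result

    nested = {
        "mmlu": {
            "stem": {"physics": DummyTask("physics"), "chemistry": DummyTask("chemistry")},
            "law": DummyTask("law"),
        }
    }
    print([name for name, _ in flatten(nested)])
    # ['mmlu/stem/physics', 'mmlu/stem/chemistry', 'mmlu/law']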
@@ -47,10 +209,15 @@ def lm_build_contrastive_pairs(
     max_items = None if (limit is None or limit <= 0) else int(limit)
 
     log.info("Extracting contrastive pairs", extra={"max_items": max_items})
+
+    # Get evaluator_name from extractor
+    evaluator_name = getattr(extractor, 'evaluator_name', None)
 
     # 3) Delegate: extractor loads docs and builds pairs
     # HuggingFace extractors don't need lm_eval_task - they load data directly from HuggingFace
     if isinstance(extractor, HuggingFaceBenchmarkExtractor):
-        return extractor.extract_contrastive_pairs(limit=max_items)
+        pairs = extractor.extract_contrastive_pairs(limit=max_items)
     else:
-        return extractor.extract_contrastive_pairs(lm_eval_task, limit=max_items)
+        pairs = extractor.extract_contrastive_pairs(lm_eval_task, limit=max_items)
+
+    return _add_evaluator_to_pairs(pairs, evaluator_name, task_name)
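
Taken together, these hunks give lm_task_pairs_generation.py a single unified entry point. A hypothetical usage sketch, assuming wisent and its lm-eval dependencies are installed; the task names and limits are examples only:

    from wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_pairs_generation import (
        build_contrastive_pairs,
    )

    # Single lm-eval task: the loader resolves the extractor and the task itself.
    pairs = build_contrastive_pairs("winogrande", limit=10)

    # Each returned pair now carries provenance in its metadata.
    for pair in pairs[:3]:
        print(pair.metadata["source_task"], pair.metadata["evaluator_name"])

    # Group tasks (e.g. "mmlu") are flattened to leaf subtasks, the limit is
    # split across them (at least one pair per subtask), and the result is
    # shuffled before being trimmed to the limit.
    group_pairs = build_contrastive_pairs("mmlu", limit=50)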
@@ -10,6 +10,10 @@ os.environ['TF_NUM_INTEROP_THREADS'] = '1'
 os.environ['TF_NUM_INTRAOP_THREADS'] = '1'
 os.environ['OMP_NUM_THREADS'] = '1'
 
+# Allow code evaluation for code-related tasks (humaneval, etc.)
+# Required by HuggingFace evaluate library for code_eval metric
+os.environ['HF_ALLOW_CODE_EVAL'] = '1'
+
 # Enable trust_remote_code for all datasets (required for meddialog and others)
 # This uses lm-eval's recommended approach from PR #1998
 import datasets.config
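
The new HF_ALLOW_CODE_EVAL export matters because the HuggingFace evaluate library's code_eval metric refuses to execute model-generated code unless that variable is set before the metric runs. A standalone sketch of the requirement (needs the evaluate package; the toy prediction and test case are illustrative only):

    import os

    # Must be set before code_eval runs, otherwise it raises an error by design.
    os.environ["HF_ALLOW_CODE_EVAL"] = "1"

    import evaluate

    code_eval = evaluate.load("code_eval")
    pass_at_k, details = code_eval.compute(
        predictions=[["def add(a, b):\n    return a + b"]],  # candidate solutions
        references=["assert add(1, 2) == 3"],                # test case per problem
        k=[1],
    )
    print(pass_at_k)  # e.g. {'pass@1': 1.0}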
@@ -294,6 +298,8 @@ class LMEvalDataLoader(BaseDataLoader):
             "tinytruthfulqa": "tinyTruthfulQA",
             "tinywinogrande": "tinyWinogrande",
             "paws-x": "pawsx",
+            # afrobench subtasks
+            "afrobench_adr": "adr",
         }
 
         # Use mapped name if available, otherwise use original
@@ -302,7 +308,9 @@ class LMEvalDataLoader(BaseDataLoader):
             log.info(f"Mapping task '{task_name}' to lm-eval task '{lm_eval_task_name}'")
 
         # Tasks that require case-sensitive names (don't lowercase these)
-        case_sensitive_prefixes = {"tinyBenchmarks"}
+        # AraDiCE tasks have mixed case (e.g., AraDiCE_ArabicMMLU_lev)
+        # aexams tasks have mixed case (e.g., aexams_IslamicStudies)
+        case_sensitive_prefixes = {"tinyBenchmarks", "AraDiCE", "aexams_"}
 
         # Normalize task name to lowercase for lm-eval-harness compatibility
         # Many lm-eval tasks use lowercase names (e.g., "aradice" not "AraDICE")
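
A compact sketch of the task-name normalization these two loader hunks imply: apply the alias map first, then lowercase unless the name starts with a case-sensitive prefix. This is simplified stand-in logic, not the actual LMEvalDataLoader implementation:

    # Simplified stand-in for the loader's normalization; values taken from the diff.
    TASK_NAME_MAP = {"paws-x": "pawsx", "afrobench_adr": "adr"}
    CASE_SENSITIVE_PREFIXES = ("tinyBenchmarks", "AraDiCE", "aexams_")

    def normalize_task_name(task_name: str) -> str:
        name = TASK_NAME_MAP.get(task_name, task_name)
        if name.startswith(CASE_SENSITIVE_PREFIXES):
            return name          # e.g. AraDiCE_ArabicMMLU_lev keeps its casing
        return name.lower()      # most lm-eval tasks use lowercase names

    print(normalize_task_name("AraDiCE_ArabicMMLU_lev"))  # AraDiCE_ArabicMMLU_lev
    print(normalize_task_name("paws-x"))                  # pawsx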
@@ -379,6 +387,9 @@ class LMEvalDataLoader(BaseDataLoader):
             "noreval": ["ask_gec_p0", "ask_gec_p1", "ask_gec_p2", "ask_gec_p3", "ask_gec_p4", "ncb", "norbelebele_p0", "norbelebele_p1", "norbelebele_p2", "norbelebele_p3", "norbelebele_p4", "norcommonsenseqa_nno_p0", "norcommonsenseqa_nno_p1", "norcommonsenseqa_nno_p2", "norcommonsenseqa_nno_p3", "norcommonsenseqa_nno_p4", "norcommonsenseqa_nob_p0", "norcommonsenseqa_nob_p1", "norcommonsenseqa_nob_p2", "norcommonsenseqa_nob_p3", "norcommonsenseqa_nob_p4", "norec_document_p0", "norec_document_p1", "norec_document_p2", "norec_document_p3", "norec_document_p4", "norec_sentence_p0", "norec_sentence_p1", "norec_sentence_p2", "norec_sentence_p3", "norec_sentence_p4", "noridiom_nno_p0", "noridiom_nno_p1", "noridiom_nno_p2", "noridiom_nno_p3", "noridiom_nno_p4", "noridiom_nob_p0", "noridiom_nob_p1", "noridiom_nob_p2", "noridiom_nob_p3", "noridiom_nob_p4", "noropenbookqa_nno_p0", "noropenbookqa_nno_p1", "noropenbookqa_nno_p2", "noropenbookqa_nno_p3", "noropenbookqa_nno_p4", "noropenbookqa_nob_p0", "noropenbookqa_nob_p1", "noropenbookqa_nob_p2", "noropenbookqa_nob_p3", "noropenbookqa_nob_p4", "norquad_p0", "norquad_p1", "norquad_p2", "norquad_p3", "norquad_p4", "norrewrite_instruct", "norsumm_nno_p0", "norsumm_nno_p1", "norsumm_nno_p2", "norsumm_nno_p3", "norsumm_nno_p4", "norsumm_nno_p5", "norsumm_nob_p0", "norsumm_nob_p1", "norsumm_nob_p2", "norsumm_nob_p3", "norsumm_nob_p4", "norsumm_nob_p5", "norsummarize_instruct", "nortruthfulqa_gen_nno_p0", "nortruthfulqa_gen_nno_p1", "nortruthfulqa_gen_nno_p2", "nortruthfulqa_gen_nno_p3", "nortruthfulqa_gen_nno_p4", "nortruthfulqa_gen_nob_p0", "nortruthfulqa_gen_nob_p1", "nortruthfulqa_gen_nob_p2", "nortruthfulqa_gen_nob_p3", "nortruthfulqa_gen_nob_p4", "nortruthfulqa_mc_nno_p0", "nortruthfulqa_mc_nno_p1", "nortruthfulqa_mc_nno_p2", "nortruthfulqa_mc_nno_p3", "nortruthfulqa_mc_nno_p4", "nortruthfulqa_mc_nob_p0", "nortruthfulqa_mc_nob_p1", "nortruthfulqa_mc_nob_p2", "nortruthfulqa_mc_nob_p3", "nortruthfulqa_mc_nob_p4", "nrk_quiz_qa_nno_p0", "nrk_quiz_qa_nno_p1", "nrk_quiz_qa_nno_p2", "nrk_quiz_qa_nno_p3", "nrk_quiz_qa_nno_p4", "nrk_quiz_qa_nob_p0", "nrk_quiz_qa_nob_p1", "nrk_quiz_qa_nob_p2", "nrk_quiz_qa_nob_p3", "nrk_quiz_qa_nob_p4", "tatoeba_eng_nno_p0", "tatoeba_eng_nno_p1", "tatoeba_eng_nno_p2", "tatoeba_eng_nno_p3", "tatoeba_eng_nob_p0", "tatoeba_eng_nob_p1", "tatoeba_eng_nob_p2", "tatoeba_eng_nob_p3", "tatoeba_nno_eng_p0", "tatoeba_nno_eng_p1", "tatoeba_nno_eng_p2", "tatoeba_nno_eng_p3", "tatoeba_nob_eng_p0", "tatoeba_nob_eng_p1", "tatoeba_nob_eng_p2", "tatoeba_nob_eng_p3"],
             "storycloze": ["xstorycloze_en"],
             "instructhumaneval": ["humaneval_instruct"],
+            # African language benchmarks
+            "afrimgsm": ["afrimgsm_amh_prompt_1", "afrimgsm_eng_prompt_1", "afrimgsm_fra_prompt_1", "afrimgsm_hau_prompt_1", "afrimgsm_ibo_prompt_1", "afrimgsm_kin_prompt_1", "afrimgsm_swa_prompt_1", "afrimgsm_yor_prompt_1"],
+            "afrimmlu": ["afrimmlu_direct_amh_prompt_1", "afrimmlu_direct_eng_prompt_1", "afrimmlu_direct_fra_prompt_1", "afrimmlu_direct_hau_prompt_1", "afrimmlu_direct_ibo_prompt_1", "afrimmlu_direct_kin_prompt_1", "afrimmlu_direct_swa_prompt_1", "afrimmlu_direct_yor_prompt_1"],
         }
 
         # Check if task is explicitly disabled