wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/activation_cache.py +393 -0
  3. wisent/core/activations/activations.py +3 -3
  4. wisent/core/activations/activations_collector.py +9 -5
  5. wisent/core/activations/classifier_inference_strategy.py +12 -11
  6. wisent/core/activations/extraction_strategy.py +256 -84
  7. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  8. wisent/core/cli/__init__.py +2 -1
  9. wisent/core/cli/agent/apply_steering.py +5 -7
  10. wisent/core/cli/agent/train_classifier.py +19 -7
  11. wisent/core/cli/check_linearity.py +35 -3
  12. wisent/core/cli/cluster_benchmarks.py +4 -6
  13. wisent/core/cli/create_steering_vector.py +6 -4
  14. wisent/core/cli/diagnose_vectors.py +7 -4
  15. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  16. wisent/core/cli/generate_pairs_from_task.py +9 -56
  17. wisent/core/cli/geometry_search.py +137 -0
  18. wisent/core/cli/get_activations.py +1 -1
  19. wisent/core/cli/method_optimizer.py +4 -3
  20. wisent/core/cli/modify_weights.py +3 -2
  21. wisent/core/cli/optimize_sample_size.py +1 -1
  22. wisent/core/cli/optimize_steering.py +14 -16
  23. wisent/core/cli/optimize_weights.py +2 -1
  24. wisent/core/cli/preview_pairs.py +203 -0
  25. wisent/core/cli/steering_method_trainer.py +3 -3
  26. wisent/core/cli/tasks.py +19 -76
  27. wisent/core/cli/train_unified_goodness.py +3 -3
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  30. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  36. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  37. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  38. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  273. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  274. wisent/core/geometry_runner.py +995 -0
  275. wisent/core/geometry_search_space.py +237 -0
  276. wisent/core/hyperparameter_optimizer.py +1 -1
  277. wisent/core/main.py +3 -0
  278. wisent/core/models/core/atoms.py +5 -3
  279. wisent/core/models/wisent_model.py +1 -1
  280. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  281. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  282. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  283. wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
  284. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  285. wisent/core/parser_arguments/main_parser.py +8 -0
  286. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  287. wisent/core/steering.py +5 -3
  288. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  289. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  290. wisent/core/trainers/steering_trainer.py +2 -2
  291. wisent/core/utils/device.py +27 -27
  292. wisent/core/utils/layer_combinations.py +70 -0
  293. wisent/examples/__init__.py +1 -0
  294. wisent/examples/scripts/__init__.py +1 -0
  295. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  296. wisent/examples/scripts/discover_directions.py +469 -0
  297. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  298. wisent/examples/scripts/generate_paper_data.py +384 -0
  299. wisent/examples/scripts/intervention_validation.py +626 -0
  300. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  301. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  302. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  303. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  304. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  305. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  306. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  307. wisent/examples/scripts/search_all_short_names.py +31 -0
  308. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  309. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  310. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  311. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  312. wisent/examples/scripts/test_one_benchmark.py +324 -0
  313. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  314. wisent/examples/scripts/threshold_analysis.py +434 -0
  315. wisent/examples/scripts/visualization_gallery.py +582 -0
  316. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  317. wisent/parameters/lm_eval/category_directions.json +137 -0
  318. wisent/parameters/lm_eval/repair_plan.json +282 -0
  319. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  320. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  321. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  322. wisent/tests/test_detector_accuracy.py +1 -1
  323. wisent/tests/visualize_geometry.py +1 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  325. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
  326. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  327. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  328. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  329. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  330. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
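Note: to reproduce a file-level listing like the one above locally, both wheels can be downloaded (for example with "pip download wisent==0.7.701 --no-deps" and "pip download wisent==0.7.901 --no-deps") and their archive members compared. A minimal standard-library sketch, assuming the two wheel files sit in the current directory:

import zipfile

old = set(zipfile.ZipFile("wisent-0.7.701-py3-none-any.whl").namelist())
new = set(zipfile.ZipFile("wisent-0.7.901-py3-none-any.whl").namelist())

print("added:", sorted(new - old))        # e.g. the new geometry_search and examples/scripts files
print("removed:", sorted(old - new))      # e.g. hf_task_extractors/browsecomp.py
print("present in both:", len(old & new)) # candidates for content changes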
wisent/examples/scripts/results/test_afrimmlu_evaluation.json
@@ -0,0 +1,324 @@
+ {
+ "task_name": "afrimmlu",
+ "model_name": "mock",
+ "evaluator_name": "log_likelihoods",
+ "num_pairs": 15,
+ "all_correct": true,
+ "pairs": [
+ {
+ "pair_id": 0,
+ "prompt": "Ked\u1ee5 \u1ee5d\u1ecb emume Steeti a ch\u1ecdr\u1ecd?...",
+ "positive_response": "[",
+ "negative_response": "'",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '[' (log_prob=-0.500), Expected: '['"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '[' (log_prob=-0.500), Expected: '''"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 1,
+ "prompt": "\u12a8\u121a\u12a8\u1270\u1209\u1275 \u12cd\u1235\u1325 \u12e8\u12cb\u130b (P) \u1260\u134d\u1341\u121d \u1349\u12ad\u12ad\u122d \u12cd\u1235\u1325 \u1208\u121d\u1295 \u12a8\u1205\u12f3\u130d \u1308\u1262 (MR) \u130b\u122d \u12a5\u1295\u12f0\u121a\u1270\u12ab\u12a8\u120d\u1363 \u12a5\u1293 \u12cb\u130b (P) \u1260\u1265\u1278\u129d\u1290\u1275 \u12c8\u12ed\u121d \u134d\u133d\u121d\u1293 \u1260\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a8\u12a0\u1290\u1235\u1270\u129b \u1308\u1262 \u1208\u121d\u1295 \u12a5\u1295\u12f0\u121a\u1260\u120d\u1325 \u1260\u1275\u12ad\u12ad\u120d \u12e8\u121a\u12eb\u1235\u1228\u12f3\u12cd \u12e8\u1275\u129b\u12cd \u1290\u12cd? I. MR = P = \u1208\u134d\u1339\u121d \u12cd\u12f5\u12f5\u122d \u1260\u12a0\u130d\u12f5\u121d \u1270\u130d\u1263\u122d \u120b\u12ed \u134d\u120b\u130e\u1275. II. P > MR \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a5\u1295\u12f0 \u1241\u120d\u1241\u120d \u1270\u1295\u1238\u122b\u1273\u127d \u1270\u130d\u1263\u122b\u1275\u1362 III. \u134d\u120b\u130e\u1275 \u12a5\u1293 \u12cb\u130b \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u12cd\u12f5\u12f5\u122d \u12a5\u1295\u12f0 \u124b\u121a \u1270\u130d\u1263\u122d \u1290\u12cd \u12e8\u121a\u12c8\u12a8\u1209\u1275\u1362...",
+ "positive_response": "'",
+ "negative_response": "I",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: 'I'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 2,
+ "prompt": "Na 1997, \u1ee4l\u1ecd ak\u1ee5 \u1ee5wa ch\u1ecdp\u1ee5tara na fraksh\u1ecdn nke nd\u1ecb b\u1ecb n'\u1ee5wa niile bi na mba nd\u1ecb na-akpata obere ego--ya b\u1ee5 mba kwa onye \u1ecdb\u1ee5la n'eri \u1ee5gw\u1ecd \u1ecdnwa nke $1230 ma\u1ecdb\u1ee5 n'erughi --d\u1ecb ihe d\u1ecb ka...",
+ "positive_response": "5",
+ "negative_response": "%",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '5' (log_prob=-0.500), Expected: '5'"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '5' (log_prob=-0.500), Expected: '%'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 3,
+ "prompt": "Mu gihe cyacyo cy'agahebuzo, ni irihe tandukaniro riri hagati y'abantu bo mu Majyepfo n'abo mu bindi bice bya Leta Zunze Ubumwe z'Amerika bemera ko amashuri atandukana n'ayandi?...",
+ "positive_response": "'",
+ "negative_response": "8",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '8'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 4,
+ "prompt": "Ni iyihe mibare yabaturage itubwira igipimo cyuburumbuke aho umubare wabana bazavuka uzaba uhagije kugira ngo usimbure ababyeyi no kwishyura impfu zabana bapfa bakiri bato?...",
+ "positive_response": "'",
+ "negative_response": "U",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: 'U'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 5,
+ "prompt": "A 2016, kusan mutane nawa a Amurka ba su da matsuguni?...",
+ "positive_response": "'",
+ "negative_response": "5",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '5'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 6,
+ "prompt": "Pour quel nombre l'\u00e9quation ci-apr\u00e8s est-elle vraie? : 26,1 mm = _ dm....",
+ "positive_response": "[",
+ "negative_response": "'",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '[' (log_prob=-0.500), Expected: '['"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '[' (log_prob=-0.500), Expected: '''"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 7,
+ "prompt": "Je, kuna vikwazo vyovyote vya kuingizwa kwa uhalifu wa kimila chini ya sheria ya Kiingereza?...",
+ "positive_response": "a",
+ "negative_response": "k",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'a' (log_prob=-0.500), Expected: 'a'"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'a' (log_prob=-0.500), Expected: 'k'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 8,
+ "prompt": "Akwai wasu iyakoki dan shigar laifukan al'ada a \u0199ar\u0199ashin dokar Ingilishi?...",
+ "positive_response": "a",
+ "negative_response": "b",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'a' (log_prob=-0.500), Expected: 'a'"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'a' (log_prob=-0.500), Expected: 'b'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 9,
+ "prompt": "Kufikia 2019, ni takriban asilimia ngapi ya watu duniani wanaishi katika umaskini uliokithiri (chini ya $1.90 kwa siku)?...",
+ "positive_response": "'",
+ "negative_response": "5",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '5'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 10,
+ "prompt": "Which treaties are considered as 'source of international law' under article 38 ICJ Statute?...",
+ "positive_response": "'",
+ "negative_response": "A",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: 'A'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 11,
+ "prompt": "Umupaka w'ibishoboka byo gukora uzagaragara nk'umurongo ugororotse igihe...",
+ "positive_response": "b",
+ "negative_response": "u",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'b' (log_prob=-0.500), Expected: 'b'"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'b' (log_prob=-0.500), Expected: 'u'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 12,
+ "prompt": "T\u00ed \u00ecd\u00e1s\u00ed l\u00e1ti \u00ecta t\u00ed \u00f2 d\u00e1ra b\u00e1 darap\u1ecd\u0300 m\u1ecd\u0301 \u00ecp\u00e8s\u00e8 ohun \u00e8l\u00f2 al\u00e1d\u00e0ni, \u00e8wo n\u00edn\u00fa \u00e0w\u1ecdn w\u1ecd\u0300ny\u00ec\u00ed ni \u00ecg\u00e9s\u1eb9\u0300 l\u00e1ti \u1ecd\u0300d\u1ecd\u0300 \u00ecj\u1ecdba t\u00ed \u00f3 \u1e63e\u00e9\u1e63e k\u00f3 gb\u00e9 \u1ecdj\u00e0 n\u00e1\u00e0 l\u1ecd s\u00edbi \u00e0b\u00e1j\u00e1de t\u00f3 d\u00e1ra?...",
+ "positive_response": "T",
+ "negative_response": "i",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'T' (log_prob=-0.500), Expected: 'T'"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'T' (log_prob=-0.500), Expected: 'i'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 13,
+ "prompt": "What was GDP per capita in the United States in 1850 when adjusting for inflation and PPP in 2011 prices?...",
+ "positive_response": "'",
+ "negative_response": "A",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: 'A'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 14,
+ "prompt": "Ni uwuhe mubare w'abana bafite hagati y'imyaka 13 na 15 muri Misiri bavuze ko bahohotewe nibura rimwe mu mezi abiri ashize, guhera mu mwaka wa 2015?...",
+ "positive_response": "0",
+ "negative_response": "%",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '0' (log_prob=-0.500), Expected: '0'"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '0' (log_prob=-0.500), Expected: '%'"
+ },
+ "both_correct": true
+ }
+ ]
+ }
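The *_evaluation.json files added under wisent/examples/scripts/results appear to share the structure shown above: task metadata plus, for each contrastive pair, a positive and a negative evaluation and a both_correct flag. A minimal sketch, not part of the package, for summarizing one of these result files:

import json
from pathlib import Path

# Path taken from the file list above; adjust to your checkout or extracted wheel.
path = Path("wisent/examples/scripts/results/test_afrimmlu_evaluation.json")
data = json.loads(path.read_text())

# Count pairs where both the positive and the negative response were judged correctly.
fully_correct = sum(1 for pair in data["pairs"] if pair["both_correct"])
print(f"{data['task_name']}: {fully_correct}/{data['num_pairs']} pairs fully correct "
      f"(all_correct={data['all_correct']})")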
wisent/examples/scripts/results/test_afrimmlu_pairs.json
@@ -0,0 +1,92 @@
+ [
+ {
+ "pair_id": 0,
+ "prompt": "Ked\u1ee5 \u1ee5d\u1ecb emume Steeti a ch\u1ecdr\u1ecd?",
+ "positive_response": "[",
+ "negative_response": "'"
+ },
+ {
+ "pair_id": 1,
+ "prompt": "\u12a8\u121a\u12a8\u1270\u1209\u1275 \u12cd\u1235\u1325 \u12e8\u12cb\u130b (P) \u1260\u134d\u1341\u121d \u1349\u12ad\u12ad\u122d \u12cd\u1235\u1325 \u1208\u121d\u1295 \u12a8\u1205\u12f3\u130d \u1308\u1262 (MR) \u130b\u122d \u12a5\u1295\u12f0\u121a\u1270\u12ab\u12a8\u120d\u1363 \u12a5\u1293 \u12cb\u130b (P) \u1260\u1265\u1278\u129d\u1290\u1275 \u12c8\u12ed\u121d \u134d\u133d\u121d\u1293 \u1260\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a8\u12a0\u1290\u1235\u1270\u129b \u1308\u1262 \u1208\u121d\u1295 \u12a5\u1295\u12f0\u121a\u1260\u120d\u1325 \u1260\u1275\u12ad\u12ad\u120d \u12e8\u121a\u12eb\u1235\u1228\u12f3\u12cd \u12e8\u1275\u129b\u12cd \u1290\u12cd? I. MR = P = \u1208\u134d\u1339\u121d \u12cd\u12f5\u12f5\u122d \u1260\u12a0\u130d\u12f5\u121d \u1270\u130d\u1263\u122d \u120b\u12ed \u134d\u120b\u130e\u1275. II. P > MR \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a5\u1295\u12f0 \u1241\u120d\u1241\u120d \u1270\u1295\u1238\u122b\u1273\u127d \u1270\u130d\u1263\u122b\u1275\u1362 III. \u134d\u120b\u130e\u1275 \u12a5\u1293 \u12cb\u130b \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u12cd\u12f5\u12f5\u122d \u12a5\u1295\u12f0 \u124b\u121a \u1270\u130d\u1263\u122d \u1290\u12cd \u12e8\u121a\u12c8\u12a8\u1209\u1275\u1362",
+ "positive_response": "'",
+ "negative_response": "I"
+ },
+ {
+ "pair_id": 2,
+ "prompt": "Na 1997, \u1ee4l\u1ecd ak\u1ee5 \u1ee5wa ch\u1ecdp\u1ee5tara na fraksh\u1ecdn nke nd\u1ecb b\u1ecb n'\u1ee5wa niile bi na mba nd\u1ecb na-akpata obere ego--ya b\u1ee5 mba kwa onye \u1ecdb\u1ee5la n'eri \u1ee5gw\u1ecd \u1ecdnwa nke $1230 ma\u1ecdb\u1ee5 n'erughi --d\u1ecb ihe d\u1ecb ka",
+ "positive_response": "5",
+ "negative_response": "%"
+ },
+ {
+ "pair_id": 3,
+ "prompt": "Mu gihe cyacyo cy'agahebuzo, ni irihe tandukaniro riri hagati y'abantu bo mu Majyepfo n'abo mu bindi bice bya Leta Zunze Ubumwe z'Amerika bemera ko amashuri atandukana n'ayandi?",
+ "positive_response": "'",
+ "negative_response": "8"
+ },
+ {
+ "pair_id": 4,
+ "prompt": "Ni iyihe mibare yabaturage itubwira igipimo cyuburumbuke aho umubare wabana bazavuka uzaba uhagije kugira ngo usimbure ababyeyi no kwishyura impfu zabana bapfa bakiri bato?",
+ "positive_response": "'",
+ "negative_response": "U"
+ },
+ {
+ "pair_id": 5,
+ "prompt": "A 2016, kusan mutane nawa a Amurka ba su da matsuguni?",
+ "positive_response": "'",
+ "negative_response": "5"
+ },
+ {
+ "pair_id": 6,
+ "prompt": "Pour quel nombre l'\u00e9quation ci-apr\u00e8s est-elle vraie? : 26,1 mm = _ dm.",
+ "positive_response": "[",
+ "negative_response": "'"
+ },
+ {
+ "pair_id": 7,
+ "prompt": "Je, kuna vikwazo vyovyote vya kuingizwa kwa uhalifu wa kimila chini ya sheria ya Kiingereza?",
+ "positive_response": "a",
+ "negative_response": "k"
+ },
+ {
+ "pair_id": 8,
+ "prompt": "Akwai wasu iyakoki dan shigar laifukan al'ada a \u0199ar\u0199ashin dokar Ingilishi?",
+ "positive_response": "a",
+ "negative_response": "b"
+ },
+ {
+ "pair_id": 9,
+ "prompt": "Kufikia 2019, ni takriban asilimia ngapi ya watu duniani wanaishi katika umaskini uliokithiri (chini ya $1.90 kwa siku)?",
+ "positive_response": "'",
+ "negative_response": "5"
+ },
+ {
+ "pair_id": 10,
+ "prompt": "Which treaties are considered as 'source of international law' under article 38 ICJ Statute?",
+ "positive_response": "'",
+ "negative_response": "A"
+ },
+ {
+ "pair_id": 11,
+ "prompt": "Umupaka w'ibishoboka byo gukora uzagaragara nk'umurongo ugororotse igihe",
+ "positive_response": "b",
+ "negative_response": "u"
+ },
+ {
+ "pair_id": 12,
+ "prompt": "T\u00ed \u00ecd\u00e1s\u00ed l\u00e1ti \u00ecta t\u00ed \u00f2 d\u00e1ra b\u00e1 darap\u1ecd\u0300 m\u1ecd\u0301 \u00ecp\u00e8s\u00e8 ohun \u00e8l\u00f2 al\u00e1d\u00e0ni, \u00e8wo n\u00edn\u00fa \u00e0w\u1ecdn w\u1ecd\u0300ny\u00ec\u00ed ni \u00ecg\u00e9s\u1eb9\u0300 l\u00e1ti \u1ecd\u0300d\u1ecd\u0300 \u00ecj\u1ecdba t\u00ed \u00f3 \u1e63e\u00e9\u1e63e k\u00f3 gb\u00e9 \u1ecdj\u00e0 n\u00e1\u00e0 l\u1ecd s\u00edbi \u00e0b\u00e1j\u00e1de t\u00f3 d\u00e1ra?",
+ "positive_response": "T",
+ "negative_response": "i"
+ },
+ {
+ "pair_id": 13,
+ "prompt": "What was GDP per capita in the United States in 1850 when adjusting for inflation and PPP in 2011 prices?",
+ "positive_response": "'",
+ "negative_response": "A"
+ },
+ {
+ "pair_id": 14,
+ "prompt": "Ni uwuhe mubare w'abana bafite hagati y'imyaka 13 na 15 muri Misiri bavuze ko bahohotewe nibura rimwe mu mezi abiri ashize, guhera mu mwaka wa 2015?",
+ "positive_response": "0",
+ "negative_response": "%"
+ }
+ ]
wisent/examples/scripts/search_all_short_names.py
@@ -0,0 +1,31 @@
+ #!/usr/bin/env python3
+ """Search for all short task names that might match Tag."""
+
+ import sys
+ sys.path.insert(0, '/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source')
+
+ from lm_eval.tasks import TaskManager
+
+ def main():
+     tm = TaskManager()
+
+     # Get all 3-letter task names
+     three_letter = [t for t in tm.task_index.keys() if len(t) == 3]
+     print(f"Found {len(three_letter)} tasks with exactly 3 letters:")
+     for task in sorted(three_letter):
+         print(f" - {task}")
+
+     # Get all 3-4 letter task names starting with T
+     short_t = [t for t in tm.task_index.keys() if t.lower().startswith('t') and 3 <= len(t) <= 4]
+     print(f"\nFound {len(short_t)} tasks with 3-4 letters starting with 't':")
+     for task in sorted(short_t):
+         print(f" - {task}")
+
+     # Search for anything with T, A, G in sequence (case insensitive)
+     tag_pattern = [t for t in tm.task_index.keys() if 't' in t.lower() and 'a' in t.lower() and 'g' in t.lower()]
+     print(f"\nFound {len(tag_pattern)} tasks containing t, a, and g:")
+     for task in sorted(tag_pattern)[:20]: # Show first 20
+         print(f" - {task}")
+
+ if __name__ == "__main__":
+     main()
wisent/examples/scripts/test_all_benchmarks.py
@@ -0,0 +1,138 @@
+ """Test all benchmarks to verify extractor and evaluator work."""
+
+ import json
+ import os
+ import sys
+ import signal
+ from contextlib import contextmanager
+ from pathlib import Path
+ from wisent.examples.scripts.test_one_benchmark import test_benchmark
+
+ # Set environment variable to trust remote code for datasets like meddialog
+ os.environ['HF_DATASETS_TRUST_REMOTE_CODE'] = '1'
+ # Set environment variable to allow code eval for coding tasks like humaneval, instructhumaneval
+ os.environ['HF_ALLOW_CODE_EVAL'] = '1'
+
+
+ class TimeoutError(Exception):
+     """Raised when a test times out."""
+     pass
+
+
+ @contextmanager
+ def timeout(seconds):
+     """Context manager for timing out operations."""
+     def signal_handler(signum, frame):
+         raise TimeoutError(f"Test timed out after {seconds} seconds")
+
+     # Set the signal handler and alarm
+     old_handler = signal.signal(signal.SIGALRM, signal_handler)
+     signal.alarm(seconds)
+
+     try:
+         yield
+     finally:
+         signal.alarm(0)
+         signal.signal(signal.SIGALRM, old_handler)
+
+
+ def load_benchmarks():
+     """Load benchmarks from central registry."""
+     from wisent.core.benchmark_registry import get_all_benchmarks, get_broken_tasks
+
+     broken_tasks = get_broken_tasks()
+     if broken_tasks:
+         print(f"Skipping {len(broken_tasks)} broken benchmarks: {', '.join(broken_tasks)}")
+
+     return get_all_benchmarks()
+
+
+ BENCHMARKS = load_benchmarks()
+
+
+ def test_all_benchmarks(model_name: str = "meta-llama/Llama-3.1-8B-Instruct", output_dir: str = ".", start_index: int = 0):
+     """Test all benchmarks.
+
+     Args:
+         model_name: Model to use for testing
+         output_dir: Directory to save results
+         start_index: Index to start testing from (0-based)
+
+     Returns:
+         Dictionary with results for each benchmark
+     """
+     results = {
+         "model": model_name,
+         "total": len(BENCHMARKS),
+         "passed": 0,
+         "failed": 0,
+         "benchmarks": {}
+     }
+
+     print(f"\n{'='*70}")
+     print(f"Testing {len(BENCHMARKS)} benchmarks with {model_name}")
+     if start_index > 0:
+         print(f"Starting from benchmark {start_index + 1} ({BENCHMARKS[start_index]})")
+     print(f"{'='*70}\n")
+
+     for i, benchmark in enumerate(BENCHMARKS, 1):
+         if i - 1 < start_index:
+             continue
+         print(f"[{i}/{len(BENCHMARKS)}] Testing {benchmark}...")
+
+         try:
+             with timeout(1200):
+                 success = test_benchmark(benchmark, model_name, output_dir)
+             results["benchmarks"][benchmark] = {
+                 "status": "passed" if success else "failed",
+                 "success": success
+             }
+
+             if success:
+                 results["passed"] += 1
+                 print(f"  PASSED\n")
+             else:
+                 results["failed"] += 1
+                 print(f"  FAILED\n")
+
+         except TimeoutError as e:
+             results["benchmarks"][benchmark] = {
+                 "status": "timeout",
+                 "success": False,
+                 "error": str(e)
+             }
+             results["failed"] += 1
+             print(f" TIMEOUT: {e}\n")
+
+         except Exception as e:
+             results["benchmarks"][benchmark] = {
+                 "status": "error",
+                 "success": False,
+                 "error": str(e)
+             }
+             results["failed"] += 1
+             print(f"  ERROR: {e}\n")
+
+     print(f"\n{'='*70}")
+     print(f"SUMMARY")
+     print(f"{'='*70}")
+     print(f"Total: {results['total']}")
+     print(f"Passed: {results['passed']}")
+     print(f"Failed: {results['failed']}")
+     print(f"Success rate: {results['passed']/results['total']*100:.1f}%")
+     print(f"{'='*70}\n")
+
+     return results
+
+
+ if __name__ == "__main__":
+     model = sys.argv[1] if len(sys.argv) > 1 else "meta-llama/Llama-3.1-8B-Instruct"
+     # Default to results directory in scripts folder
+     default_output = Path(__file__).parent / "results"
+     output_dir = sys.argv[2] if len(sys.argv) > 2 else str(default_output)
+     start_index = int(sys.argv[3]) if len(sys.argv) > 3 else 0
+
+     results = test_all_benchmarks(model, output_dir, start_index)
+
+     # Exit with appropriate code
+     sys.exit(0 if results["failed"] == 0 else 1)
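As the __main__ block above shows, test_all_benchmarks.py takes up to three positional arguments (model name, output directory, start index) and exits non-zero when any benchmark fails or times out; the runner can also be invoked directly from Python. A minimal sketch, assuming the wisent package and its benchmark registry are importable and the model id is available locally:

from wisent.examples.scripts.test_all_benchmarks import test_all_benchmarks

# Illustrative values; any model id accepted by the package should work here.
results = test_all_benchmarks(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    output_dir="wisent/examples/scripts/results",
    start_index=0,
)
print(f"passed {results['passed']} of {results['total']} benchmarks")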
wisent/examples/scripts/test_all_benchmarks_new.py
@@ -0,0 +1,28 @@
+ """Test all benchmarks to verify extractor and evaluator work."""
+
+ import sys
+ import signal
+ from contextmanager import contextmanager
+ from wisent.examples.scripts.test_one_benchmark import test_benchmark
+
+
+ class TimeoutError(Exception):
+     """Raised when a test times out."""
+     pass
+
+
+ @contextmanager
+ def timeout(seconds):
+     """Context manager for timing out operations."""
+     def signal_handler(signum, frame):
+         raise TimeoutError(f"Test timed out after {seconds} seconds")
+
+     # Set the signal handler and alarm
+     old_handler = signal.signal(signal.SIGALRM, signal_handler)
+     signal.alarm(seconds)
+
+     try:
+         yield
+     finally:
+         signal.alarm(0)
+         signal.signal(signal.SIGALRM, old_handler)