wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/activation_cache.py +393 -0
  3. wisent/core/activations/activations.py +3 -3
  4. wisent/core/activations/activations_collector.py +9 -5
  5. wisent/core/activations/classifier_inference_strategy.py +12 -11
  6. wisent/core/activations/extraction_strategy.py +256 -84
  7. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  8. wisent/core/cli/__init__.py +2 -1
  9. wisent/core/cli/agent/apply_steering.py +5 -7
  10. wisent/core/cli/agent/train_classifier.py +19 -7
  11. wisent/core/cli/check_linearity.py +35 -3
  12. wisent/core/cli/cluster_benchmarks.py +4 -6
  13. wisent/core/cli/create_steering_vector.py +6 -4
  14. wisent/core/cli/diagnose_vectors.py +7 -4
  15. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  16. wisent/core/cli/generate_pairs_from_task.py +9 -56
  17. wisent/core/cli/geometry_search.py +137 -0
  18. wisent/core/cli/get_activations.py +1 -1
  19. wisent/core/cli/method_optimizer.py +4 -3
  20. wisent/core/cli/modify_weights.py +3 -2
  21. wisent/core/cli/optimize_sample_size.py +1 -1
  22. wisent/core/cli/optimize_steering.py +14 -16
  23. wisent/core/cli/optimize_weights.py +2 -1
  24. wisent/core/cli/preview_pairs.py +203 -0
  25. wisent/core/cli/steering_method_trainer.py +3 -3
  26. wisent/core/cli/tasks.py +19 -76
  27. wisent/core/cli/train_unified_goodness.py +3 -3
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  30. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  36. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  37. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  38. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  273. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  274. wisent/core/geometry_runner.py +995 -0
  275. wisent/core/geometry_search_space.py +237 -0
  276. wisent/core/hyperparameter_optimizer.py +1 -1
  277. wisent/core/main.py +3 -0
  278. wisent/core/models/core/atoms.py +5 -3
  279. wisent/core/models/wisent_model.py +1 -1
  280. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  281. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  282. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  283. wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
  284. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  285. wisent/core/parser_arguments/main_parser.py +8 -0
  286. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  287. wisent/core/steering.py +5 -3
  288. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  289. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  290. wisent/core/trainers/steering_trainer.py +2 -2
  291. wisent/core/utils/device.py +27 -27
  292. wisent/core/utils/layer_combinations.py +70 -0
  293. wisent/examples/__init__.py +1 -0
  294. wisent/examples/scripts/__init__.py +1 -0
  295. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  296. wisent/examples/scripts/discover_directions.py +469 -0
  297. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  298. wisent/examples/scripts/generate_paper_data.py +384 -0
  299. wisent/examples/scripts/intervention_validation.py +626 -0
  300. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  301. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  302. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  303. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  304. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  305. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  306. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  307. wisent/examples/scripts/search_all_short_names.py +31 -0
  308. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  309. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  310. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  311. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  312. wisent/examples/scripts/test_one_benchmark.py +324 -0
  313. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  314. wisent/examples/scripts/threshold_analysis.py +434 -0
  315. wisent/examples/scripts/visualization_gallery.py +582 -0
  316. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  317. wisent/parameters/lm_eval/category_directions.json +137 -0
  318. wisent/parameters/lm_eval/repair_plan.json +282 -0
  319. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  320. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  321. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  322. wisent/tests/test_detector_accuracy.py +1 -1
  323. wisent/tests/visualize_geometry.py +1 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  325. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
  326. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  327. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  328. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  329. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  330. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,230 @@
1
+ """Test contrastive pairs generation for all supported benchmarks.
2
+
3
+ Generates example pairs for each benchmark and shows how they look
4
+ with different extraction strategies.
5
+ """
6
+
7
+ import json
8
+ import signal
9
+ import sys
10
+ from pathlib import Path
11
+
12
+
13
class TimeoutError(Exception):
    """Signal that a benchmark exceeded its allotted time.

    Intentionally derives from ``Exception`` directly; within this module the
    name refers to this class rather than the builtin of the same name.
    """
15
+
16
+
17
def timeout_handler(signum, frame):
    """Signal handler that aborts the running work by raising TimeoutError.

    Args:
        signum: Number of the delivered signal (unused).
        frame: Stack frame that was interrupted (unused).

    Raises:
        TimeoutError: Always, with the message "Timeout".
    """
    message = "Timeout"
    raise TimeoutError(message)
19
+
20
+
21
class MockTokenizer:
    """Lightweight stand-in tokenizer used to preview extraction strategies."""

    def apply_chat_template(self, messages, tokenize=False, add_generation_prompt=False):
        """Render one or two chat messages into a simple tagged string.

        A single message becomes a user turn followed by an open assistant
        header; two messages become a user turn plus a closed assistant turn.
        Any other message count falls back to ``str(messages)``. The
        ``tokenize`` and ``add_generation_prompt`` arguments are accepted but
        not used.
        """
        turn_count = len(messages)
        if turn_count not in (1, 2):
            return str(messages)

        rendered = f"<|user|>\n{messages[0]['content']}\n<|assistant|>\n"
        if turn_count == 2:
            rendered += f"{messages[1]['content']}<|end|>"
        return rendered

    def __call__(self, text, add_special_tokens=False):
        """Split ``text`` on whitespace and return the pieces as ``input_ids``."""
        pieces = text.split()
        return {"input_ids": pieces}
33
+
34
+
35
def format_pair_with_strategies(pair, tokenizer):
    """Render a contrastive pair under every known extraction strategy.

    Args:
        pair: Object exposing ``prompt``, ``positive_response.model_response``
            and ``negative_response.model_response``.
        tokenizer: Tokenizer handed through to ``build_extraction_texts``.

    Returns:
        Dict with a ``"raw"`` entry (prompt, positive, negative) and a
        ``"strategies"`` entry mapping each strategy name either to the
        formatted positive/negative texts or to ``{"error": <message>}`` when
        building the texts for that strategy raised.
    """
    from wisent.core.activations.extraction_strategy import (
        ExtractionStrategy,
        build_extraction_texts,
    )

    prompt = pair.prompt
    positive_text = pair.positive_response.model_response
    negative_text = pair.negative_response.model_response

    strategy_names = (
        "chat_last",
        "chat_mean",
        "mc_balanced",
        "completion_last",
        "completion_mean",
        "mc_completion",
    )

    # (label, response being formatted, opposing response, is_positive flag);
    # the positive side is always built before the negative side.
    sides = (
        ("positive", positive_text, negative_text, True),
        ("negative", negative_text, positive_text, False),
    )

    per_strategy = {}
    for strategy_name in strategy_names:
        try:
            strategy = ExtractionStrategy(strategy_name)
            needs_both_responses = strategy in (
                ExtractionStrategy.MC_BALANCED,
                ExtractionStrategy.MC_COMPLETION,
            )

            formatted = {}
            for label, response, opposing, positive_flag in sides:
                # Multiple-choice strategies additionally receive the opposing
                # response and which side is being formatted.
                extra_kwargs = {}
                if needs_both_responses:
                    extra_kwargs = {
                        "other_response": opposing,
                        "is_positive": positive_flag,
                    }
                full_text, answer_token, prompt_only = build_extraction_texts(
                    strategy,
                    prompt,
                    response,
                    tokenizer,
                    auto_convert_strategy=False,
                    **extra_kwargs,
                )
                formatted[label] = {
                    "full_text": full_text,
                    "answer_token": answer_token,
                    "prompt_only": prompt_only,
                }

            per_strategy[strategy_name] = formatted
        except Exception as exc:
            # Best-effort preview: record the failure and continue with the
            # remaining strategies.
            per_strategy[strategy_name] = {"error": str(exc)}

    return {
        "raw": {
            "prompt": prompt,
            "positive": positive_text,
            "negative": negative_text,
        },
        "strategies": per_strategy,
    }
119
+
120
+
121
def test_all_benchmarks(timeout_per_task: int = 30, limit: int = 2):
    """Test contrastive pairs generation for all supported benchmarks.

    Every benchmark returned by ``get_all_benchmarks()`` that is not listed by
    ``get_broken_tasks()`` is passed to ``build_contrastive_pairs`` under a
    ``SIGALRM``-based timeout. Returned pairs are rendered with every
    extraction strategy through ``format_pair_with_strategies``. A progress
    line per benchmark and a final summary are printed to stdout.

    Note: this relies on ``signal.SIGALRM`` / ``signal.alarm``, which are only
    available on Unix, and ``signal.signal`` must be called from the main
    thread.

    Args:
        timeout_per_task: Timeout in seconds per benchmark
        limit: Number of pairs to generate per benchmark

    Returns:
        Dictionary with the counters ``total``, ``ok``, ``failed`` and
        ``timeout`` plus a ``benchmarks`` mapping from benchmark name to its
        status entry (including example pairs with all strategies on success).
    """
    from wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_pairs_generation import build_contrastive_pairs
    from wisent.core.benchmark_registry import get_all_benchmarks, get_broken_tasks

    all_benchmarks = get_all_benchmarks()
    broken = set(get_broken_tasks())

    # Filter out broken benchmarks
    benchmarks = [b for b in all_benchmarks if b not in broken]
    total = len(benchmarks)

    print(f"Testing {total} benchmarks (excluded {len(broken)} broken)")
    print(f"Timeout per task: {timeout_per_task}s, limit: {limit} pairs")
    print()

    tokenizer = MockTokenizer()

    results = {
        "total": total,
        "ok": 0,
        "failed": 0,
        "timeout": 0,
        "benchmarks": {},
    }

    for position, benchmark in enumerate(benchmarks, start=1):
        progress = f"[{position}/{total}]"

        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(timeout_per_task)

        try:
            pairs = build_contrastive_pairs(benchmark, limit=limit)
            # Only pair generation is timed; cancel the alarm before formatting.
            signal.alarm(0)

            if pairs and len(pairs) > 0:
                # Format pairs with all strategies
                formatted_pairs = [
                    format_pair_with_strategies(pair, tokenizer) for pair in pairs
                ]

                results["benchmarks"][benchmark] = {
                    "status": "ok",
                    "num_pairs": len(pairs),
                    "pairs": formatted_pairs,
                }
                # Count success only after formatting succeeded, so a
                # formatting error is not tallied as both ok and failed.
                results["ok"] += 1
                print(f"{progress} OK: {benchmark} - {len(pairs)} pairs")
            else:
                results["failed"] += 1
                results["benchmarks"][benchmark] = {"status": "no_pairs", "num_pairs": 0}
                print(f"{progress} FAIL: {benchmark} - no pairs returned")

        except TimeoutError:
            signal.alarm(0)
            results["timeout"] += 1
            results["benchmarks"][benchmark] = {"status": "timeout"}
            print(f"{progress} TIMEOUT: {benchmark}")

        except Exception as e:
            signal.alarm(0)
            results["failed"] += 1
            error_msg = str(e)[:200]
            results["benchmarks"][benchmark] = {"status": "error", "error": error_msg}
            print(f"{progress} ERROR: {benchmark} - {error_msg[:100]}")

    # Guard against an empty benchmark list, which would otherwise divide by zero.
    success_rate = results["ok"] / results["total"] * 100 if results["total"] else 0.0

    # Summary
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total tested: {results['total']}")
    print(f"OK: {results['ok']}")
    print(f"Failed: {results['failed']}")
    print(f"Timeout: {results['timeout']}")
    print(f"Success rate: {success_rate:.1f}%")
    print("=" * 60)

    return results
207
+
208
+
209
def main():
    """Command-line entry point: run the benchmark sweep and save the results.

    Parses ``--timeout``, ``--limit`` and the required ``--output`` option,
    runs ``test_all_benchmarks``, writes its result dictionary to the output
    path as JSON, and exits the process with status 0 when no benchmark failed
    or timed out, otherwise with status 1.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Test contrastive pairs for all supported benchmarks")
    parser.add_argument("--timeout", "-t", type=int, default=30, help="Timeout per task in seconds (default: 30)")
    parser.add_argument("--limit", "-l", type=int, default=2, help="Number of pairs per benchmark (default: 2)")
    parser.add_argument("--output", "-o", type=str, required=True, help="Output JSON file for results")

    args = parser.parse_args()

    results = test_all_benchmarks(timeout_per_task=args.timeout, limit=args.limit)

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file is
    # opened as UTF-8 explicitly instead of relying on the locale default.
    with open(args.output, 'w', encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to: {args.output}")

    # Exit with error code if any failures
    sys.exit(0 if results["failed"] == 0 and results["timeout"] == 0 else 1)
227
+
228
+
229
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,261 @@
1
+ """
2
+ Test whether our activation extraction gives meaningful signal
3
+ by comparing real contrastive pairs vs nonsense random pairs.
4
+
5
+ If nonsense pairs give similar Cohen's d / separation as real pairs,
6
+ then our signal is meaningless.
7
+ """
8
+
9
+ import argparse
10
+ import random
11
+ import string
12
+ import torch
13
+ import numpy as np
14
+ from typing import List, Tuple
15
+ from sklearn.svm import LinearSVC
16
+ from sklearn.preprocessing import StandardScaler
17
+
18
+ from wisent.core.models.wisent_model import WisentModel
19
+ from wisent.core.activations.extraction_strategy import ExtractionStrategy
20
+ from wisent.core.activations.activations_collector import ActivationCollector
21
+ from wisent.core.contrastive_pairs.core.pair import ContrastivePair
22
+ from wisent.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
23
+
24
+
25
# Vocabulary for the word-salad baseline: everyday English words that are
# sampled at random to build text with no coherent meaning.
WORD_LIST = (
    "water sumo half purple elephant calculator yesterday "
    "moon basket thinking telephone mountain running quickly "
    "tomorrow happiness keyboard window dancing coffee planet "
    "singing computer orange flying bicycle dream ocean "
    "pencil laughing cloud table walking music river chair "
    "jumping sun book swimming star door cooking tree "
    "writing sky flower playing rain paper sleeping green "
    "seven under before strange ancient modern simple"
).split()
35
+
36
def generate_nonsense_text(length: int = None) -> str:
    """Return a space-separated word salad drawn from ``WORD_LIST``.

    Args:
        length: Number of words to draw. When ``None``, a count between
            3 and 10 (inclusive) is picked at random.

    Returns:
        The sampled words joined by single spaces. Words are drawn with
        replacement, so repeats are possible.
    """
    word_count = random.randint(3, 10) if length is None else length
    return ' '.join(random.choices(WORD_LIST, k=word_count))
42
+
43
+
44
def generate_nonsense_pairs(n: int = 50) -> List[ContrastivePair]:
    """Build ``n`` contrastive pairs made entirely of random word salad.

    Each pair gets a 10-word prompt and two independently sampled 15-word
    responses, so the "positive" and "negative" labels carry no meaning.
    """
    # Keyword arguments are evaluated left to right, so the random draws
    # happen in prompt -> positive -> negative order for every pair.
    return [
        ContrastivePair(
            prompt=generate_nonsense_text(10),
            positive_response=PositiveResponse(model_response=generate_nonsense_text(15)),
            negative_response=NegativeResponse(model_response=generate_nonsense_text(15)),
        )
        for _ in range(n)
    ]
57
+
58
+
59
def generate_real_pairs(n: int = 50) -> List[ContrastivePair]:
    """Build ``n`` contrastive pairs from a fixed set of factual questions.

    The ten (question, correct answer, incorrect answer) templates are
    reused round-robin. Only the prompt is varied, by appending
    `` (instance i)``; the responses are taken verbatim from the template.
    """
    qa_templates = [
        ("Is the Earth flat?", "No, the Earth is approximately spherical.", "Yes, the Earth is flat."),
        ("What is 2+2?", "4", "5"),
        ("Is water wet?", "Yes, water is wet.", "No, water is not wet."),
        ("What color is the sky?", "Blue", "Green"),
        ("Is the sun a star?", "Yes, the sun is a star.", "No, the sun is a planet."),
        ("What is the capital of France?", "Paris", "London"),
        ("Is Python a programming language?", "Yes, Python is a programming language.", "No, Python is a snake."),
        ("What is 10 * 5?", "50", "100"),
        ("Is ice cold?", "Yes, ice is cold.", "No, ice is hot."),
        ("What year did WW2 end?", "1945", "1939"),
    ]
    template_count = len(qa_templates)

    built = []
    for idx in range(n):
        question, right_answer, wrong_answer = qa_templates[idx % template_count]
        # The suffix keeps prompts distinct when templates repeat.
        suffix = f" (instance {idx})"
        built.append(
            ContrastivePair(
                prompt=question + suffix,
                positive_response=PositiveResponse(model_response=right_answer),
                negative_response=NegativeResponse(model_response=wrong_answer),
            )
        )
    return built
85
+
86
+
87
def compute_cohens_d(pos_acts: np.ndarray, neg_acts: np.ndarray) -> float:
    """Compute a Cohen's-d-style effect size between two activation sets.

    The numerator is the Euclidean norm of the difference between the two
    per-dimension mean vectors. The denominator is the pooled standard
    deviation, computed per dimension and then averaged across dimensions.

    Args:
        pos_acts: Activations for the positive class, one sample per row.
        neg_acts: Activations for the negative class, one sample per row.

    Returns:
        The effect size as a plain float. Returns 0.0 when either group
        has fewer than two samples (the pooled variance is undefined) or
        when the averaged pooled standard deviation is effectively zero.
    """
    n1, n2 = len(pos_acts), len(neg_acts)
    # A sample variance needs at least two observations per group; without
    # this guard the pooled formula divides by zero and yields NaN.
    if n1 < 2 or n2 < 2:
        return 0.0

    pos_mean = np.mean(pos_acts, axis=0)
    neg_mean = np.mean(neg_acts, axis=0)

    # The pooled formula weights each group by (n - 1), which is only
    # correct for unbiased sample variances, hence ddof=1.
    pos_var = np.var(pos_acts, axis=0, ddof=1)
    neg_var = np.var(neg_acts, axis=0, ddof=1)

    pooled_std = np.sqrt(((n1 - 1) * pos_var + (n2 - 1) * neg_var) / (n1 + n2 - 2))
    pooled_std = float(np.mean(pooled_std))  # average across dimensions

    if pooled_std < 1e-10:
        return 0.0

    diff = np.linalg.norm(pos_mean - neg_mean)
    return float(diff / pooled_std)
104
+
105
+
106
def compute_linear_separability(pos_acts: np.ndarray, neg_acts: np.ndarray) -> float:
    """Score how well a linear SVM separates the two activation sets.

    Positive rows are labelled 1 and negative rows 0. The stacked features
    are standardized and a ``LinearSVC`` is fitted on them. The returned
    value is the classifier's accuracy on the same data it was fitted on
    (training accuracy, not a held-out estimate).
    """
    features = np.vstack([pos_acts, neg_acts])
    labels = np.array([1] * len(pos_acts) + [0] * len(neg_acts))

    standardized = StandardScaler().fit_transform(features)

    classifier = LinearSVC(max_iter=1000, dual=False)
    classifier.fit(standardized, labels)
    return classifier.score(standardized, labels)
118
+
119
+
120
def collect_activations(
    model: WisentModel,
    pairs: List[ContrastivePair],
    strategy: ExtractionStrategy,
    layer: int,
) -> Tuple[np.ndarray, np.ndarray]:
    """Collect one layer's activations for both responses of each pair.

    Args:
        model: Model wrapped by an ``ActivationCollector``.
        pairs: Contrastive pairs to run through the collector.
        strategy: Extraction strategy forwarded to ``collector.collect``.
        layer: Zero-based layer index; it is converted to the one-based
            string key used by the collected activation mappings.

    Returns:
        ``(pos_acts, neg_acts)`` arrays whose rows are aligned pair by
        pair. Pairs that raise during collection, or that lack the
        requested layer, are skipped (errors are printed, not raised).
    """
    collector = ActivationCollector(model)

    pos_acts = []
    neg_acts = []

    for pair in pairs:
        try:
            # Collect both positive and negative using the collect method
            result = collector.collect(pair, strategy=strategy)

            # result is a ContrastivePair with activations
            pos_layer_acts = result.positive_response.layers_activations
            neg_layer_acts = result.negative_response.layers_activations

            # Extract layer (keys are strings like '1', '2', etc, and 1-indexed)
            layer_key = str(layer + 1)  # Convert to 1-indexed string
            if pos_layer_acts is None or neg_layer_acts is None:
                continue
            if layer_key not in pos_layer_acts or layer_key not in neg_layer_acts:
                continue

            # detach()/float(): Tensor.numpy() rejects tensors that track
            # gradients and dtypes NumPy lacks (e.g. bfloat16).
            # Convert BOTH sides before appending so a failure on either
            # one cannot leave the two lists with different lengths.
            pos_vec = pos_layer_acts[layer_key].detach().float().cpu().numpy()
            neg_vec = neg_layer_acts[layer_key].detach().float().cpu().numpy()
        except Exception as e:
            # Best-effort collection: report the failure and move on.
            print(f"Error collecting pair: {e}")
            continue

        pos_acts.append(pos_vec)
        neg_acts.append(neg_vec)

    return np.array(pos_acts), np.array(neg_acts)
152
+
153
+
154
def main():
    """Compare real and nonsense contrastive pairs per strategy and layer.

    Loads the requested model, generates the same number of real and
    nonsense pairs, and for every (strategy, layer) combination collects
    activations for both sets, computes Cohen's d and linear separability
    for each, prints a verdict based on the real/nonsense Cohen's d
    comparison, and finally prints a summary table.

    Command-line options: ``--model``, ``--n-pairs``, ``--strategies``,
    ``--layers`` and ``--seed`` (optional; seeds the ``random`` module so
    the nonsense pairs are reproducible; unseeded when omitted).
    """
    parser = argparse.ArgumentParser(description="Test nonsense baseline vs real pairs")
    parser.add_argument("--model", type=str, default="meta-llama/Llama-3.2-1B-Instruct")
    parser.add_argument("--n-pairs", type=int, default=50)
    parser.add_argument("--strategies", type=str, nargs="+",
                        default=["chat_mean", "chat_max_norm", "chat_last"])
    # argparse %-formats help strings, so literal percent signs must be
    # written as %% or rendering the help text raises an error.
    parser.add_argument("--layers", type=int, nargs="+", default=None,
                        help="Layers to test. Default: [0, 25%%, 50%%, 75%%, last]")
    parser.add_argument("--seed", type=int, default=None,
                        help="Random seed for nonsense pair generation (default: unseeded)")
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)

    print(f"Loading model: {args.model}")
    model = WisentModel(args.model)
    num_layers = model.num_layers

    # Default layers if not specified
    if args.layers is None:
        # dict.fromkeys drops repeated indices (shallow models map several
        # fractions to the same layer) while keeping the original order.
        args.layers = list(dict.fromkeys([
            0,
            num_layers // 4,
            num_layers // 2,
            3 * num_layers // 4,
            num_layers - 1,
        ]))

    print(f"Model has {num_layers} layers")
    print(f"Testing layers: {args.layers}")
    print(f"Testing strategies: {args.strategies}")
    print(f"Pairs per condition: {args.n_pairs}")
    print()

    # Generate pairs
    print("Generating pairs...")
    real_pairs = generate_real_pairs(args.n_pairs)
    nonsense_pairs = generate_nonsense_pairs(args.n_pairs)

    results = []

    for strategy_name in args.strategies:
        strategy = ExtractionStrategy(strategy_name)

        for layer in args.layers:
            print(f"\n{'='*60}")
            print(f"Strategy: {strategy_name}, Layer: {layer} ({100*layer/num_layers:.0f}%)")
            print('='*60)

            # Real pairs
            print("  Collecting REAL pairs...")
            real_pos, real_neg = collect_activations(model, real_pairs, strategy, layer)

            if len(real_pos) < 10 or len(real_neg) < 10:
                print("  WARNING: Too few activations collected for real pairs")
                continue

            real_cohens_d = compute_cohens_d(real_pos, real_neg)
            real_linear = compute_linear_separability(real_pos, real_neg)

            # Nonsense pairs
            print("  Collecting NONSENSE pairs...")
            nonsense_pos, nonsense_neg = collect_activations(model, nonsense_pairs, strategy, layer)

            if len(nonsense_pos) < 10 or len(nonsense_neg) < 10:
                print("  WARNING: Too few activations collected for nonsense pairs")
                continue

            nonsense_cohens_d = compute_cohens_d(nonsense_pos, nonsense_neg)
            nonsense_linear = compute_linear_separability(nonsense_pos, nonsense_neg)

            # The denominator is floored at 0.01 so a near-zero nonsense
            # effect size cannot blow up the ratio; computed once and
            # reused for both the printout and the stored result.
            ratio = real_cohens_d / max(nonsense_cohens_d, 0.01)

            # Compare
            print(f"\n  REAL pairs:     Cohen's d = {real_cohens_d:8.2f}, Linear = {real_linear:.3f}")
            print(f"  NONSENSE pairs: Cohen's d = {nonsense_cohens_d:8.2f}, Linear = {nonsense_linear:.3f}")
            print(f"  RATIO (real/nonsense): Cohen's d = {ratio:.2f}x")

            if real_cohens_d > nonsense_cohens_d * 2:
                verdict = "SIGNAL IS REAL"
            elif real_cohens_d > nonsense_cohens_d * 1.2:
                verdict = "WEAK SIGNAL"
            else:
                verdict = "NO SIGNAL (nonsense is similar!)"

            print(f"  VERDICT: {verdict}")

            results.append({
                "strategy": strategy_name,
                "layer": layer,
                "layer_pct": 100 * layer / num_layers,
                "real_cohens_d": real_cohens_d,
                "real_linear": real_linear,
                "nonsense_cohens_d": nonsense_cohens_d,
                "nonsense_linear": nonsense_linear,
                "ratio": ratio,
                "verdict": verdict,
            })

    # Summary
    print("\n" + "="*80)
    print("SUMMARY")
    print("="*80)
    print(f"{'Strategy':<15} {'Layer':<10} {'Real d':<10} {'Nonsense d':<12} {'Ratio':<8} {'Verdict'}")
    print("-"*80)

    for r in results:
        print(f"{r['strategy']:<15} {r['layer']:>3} ({r['layer_pct']:>3.0f}%)  "
              f"{r['real_cohens_d']:>8.2f}   {r['nonsense_cohens_d']:>10.2f}   "
              f"{r['ratio']:>6.2f}x  {r['verdict']}")
258
+
259
+
260
+ if __name__ == "__main__":
261
+ main()