wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/activation_cache.py +393 -0
  3. wisent/core/activations/activations.py +3 -3
  4. wisent/core/activations/activations_collector.py +9 -5
  5. wisent/core/activations/classifier_inference_strategy.py +12 -11
  6. wisent/core/activations/extraction_strategy.py +256 -84
  7. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  8. wisent/core/cli/__init__.py +2 -1
  9. wisent/core/cli/agent/apply_steering.py +5 -7
  10. wisent/core/cli/agent/train_classifier.py +19 -7
  11. wisent/core/cli/check_linearity.py +35 -3
  12. wisent/core/cli/cluster_benchmarks.py +4 -6
  13. wisent/core/cli/create_steering_vector.py +6 -4
  14. wisent/core/cli/diagnose_vectors.py +7 -4
  15. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  16. wisent/core/cli/generate_pairs_from_task.py +9 -56
  17. wisent/core/cli/geometry_search.py +137 -0
  18. wisent/core/cli/get_activations.py +1 -1
  19. wisent/core/cli/method_optimizer.py +4 -3
  20. wisent/core/cli/modify_weights.py +3 -2
  21. wisent/core/cli/optimize_sample_size.py +1 -1
  22. wisent/core/cli/optimize_steering.py +14 -16
  23. wisent/core/cli/optimize_weights.py +2 -1
  24. wisent/core/cli/preview_pairs.py +203 -0
  25. wisent/core/cli/steering_method_trainer.py +3 -3
  26. wisent/core/cli/tasks.py +19 -76
  27. wisent/core/cli/train_unified_goodness.py +3 -3
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  30. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  36. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  37. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  38. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  273. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  274. wisent/core/geometry_runner.py +995 -0
  275. wisent/core/geometry_search_space.py +237 -0
  276. wisent/core/hyperparameter_optimizer.py +1 -1
  277. wisent/core/main.py +3 -0
  278. wisent/core/models/core/atoms.py +5 -3
  279. wisent/core/models/wisent_model.py +1 -1
  280. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  281. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  282. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  283. wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
  284. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  285. wisent/core/parser_arguments/main_parser.py +8 -0
  286. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  287. wisent/core/steering.py +5 -3
  288. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  289. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  290. wisent/core/trainers/steering_trainer.py +2 -2
  291. wisent/core/utils/device.py +27 -27
  292. wisent/core/utils/layer_combinations.py +70 -0
  293. wisent/examples/__init__.py +1 -0
  294. wisent/examples/scripts/__init__.py +1 -0
  295. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  296. wisent/examples/scripts/discover_directions.py +469 -0
  297. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  298. wisent/examples/scripts/generate_paper_data.py +384 -0
  299. wisent/examples/scripts/intervention_validation.py +626 -0
  300. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  301. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  302. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  303. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  304. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  305. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  306. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  307. wisent/examples/scripts/search_all_short_names.py +31 -0
  308. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  309. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  310. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  311. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  312. wisent/examples/scripts/test_one_benchmark.py +324 -0
  313. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  314. wisent/examples/scripts/threshold_analysis.py +434 -0
  315. wisent/examples/scripts/visualization_gallery.py +582 -0
  316. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  317. wisent/parameters/lm_eval/category_directions.json +137 -0
  318. wisent/parameters/lm_eval/repair_plan.json +282 -0
  319. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  320. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  321. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  322. wisent/tests/test_detector_accuracy.py +1 -1
  323. wisent/tests/visualize_geometry.py +1 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  325. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
  326. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  327. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  328. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  329. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  330. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,434 @@
1
+ """
2
+ Threshold Analysis for RepScan.
3
+
4
+ Analyzes sensitivity of diagnosis to threshold choices:
5
+ - ROC curves for existence threshold
6
+ - Precision/recall tradeoff
7
+ - Null distribution analysis
8
+ - Synthetic validation
9
+
10
+ Usage:
11
+ python -m wisent.examples.scripts.threshold_analysis --model Qwen/Qwen3-8B
12
+ """
13
+
14
+ import argparse
15
+ import json
16
+ import subprocess
17
+ from pathlib import Path
18
+ from typing import Dict, List, Any, Optional, Tuple
19
+ from dataclasses import dataclass, field, asdict
20
+ import random
21
+
22
+ import torch
23
+ import numpy as np
24
+ from sklearn.metrics import roc_curve, auc, precision_recall_curve
25
+
26
+ S3_BUCKET = "wisent-bucket"
27
+ S3_PREFIX = "threshold_analysis"
28
+
29
+
30
def s3_upload_file(local_path: Path, model_name: str) -> None:
    """Upload a single file to S3 with the AWS CLI (best effort).

    The destination is
    ``s3://{S3_BUCKET}/{S3_PREFIX}/{model_prefix}/{local_path.name}``, where
    ``model_prefix`` is ``model_name`` with every ``/`` replaced by ``_``.

    Failures are reported on stdout and never raised, so a missing ``aws``
    binary or a rejected upload does not abort the surrounding analysis.

    Args:
        local_path: File to upload; only its final path component is used
            in the S3 key.
        model_name: Model identifier used to build the S3 sub-folder.
    """
    model_prefix = model_name.replace('/', '_')
    s3_path = f"s3://{S3_BUCKET}/{S3_PREFIX}/{model_prefix}/{local_path.name}"
    try:
        subprocess.run(
            ["aws", "s3", "cp", str(local_path), s3_path, "--quiet"],
            check=True,
            capture_output=True,
        )
        print(f" Uploaded to S3: {s3_path}")
    except subprocess.CalledProcessError as e:
        # The CLI's stderr is captured (as bytes) rather than shown, so include
        # it here; str(e) alone only carries the exit status.
        stderr = e.stderr.decode(errors="replace").strip() if e.stderr else ""
        if stderr:
            print(f" S3 upload failed: {e}: {stderr}")
        else:
            print(f" S3 upload failed: {e}")
    except Exception as e:
        # Deliberately broad: the upload is optional and must never crash
        # the caller (e.g. when the ``aws`` executable is not installed).
        print(f" S3 upload failed: {e}")
43
+
44
+
45
@dataclass
class ThresholdAnalysisResult:
    """Container for the outputs of a threshold analysis.

    Fields are grouped by the analysis they belong to:

    * ``existence_*`` and ``optimal_existence_threshold``: existence
      threshold analysis (candidate thresholds, true/false positive rates,
      AUC and the selected threshold).
    * ``gap_*`` and ``optimal_gap_threshold``: gap threshold analysis
      (candidate thresholds with precision, recall and F1, and the selected
      threshold).
    * ``null_*``: null distribution statistics for the kNN and linear scores.
    * ``sensitivity_matrix``: sensitivity analysis, mapping a threshold to
      its diagnosis distribution.
    """

    # --- Existence threshold analysis ---
    existence_thresholds: List[float]
    existence_tpr: List[float]  # true positive rate
    existence_fpr: List[float]  # false positive rate
    existence_auc: float
    optimal_existence_threshold: float

    # --- Gap threshold analysis ---
    gap_thresholds: List[float]
    gap_precision: List[float]
    gap_recall: List[float]
    gap_f1: List[float]
    optimal_gap_threshold: float

    # --- Null distribution stats ---
    null_mean_knn: float
    null_std_knn: float
    null_mean_linear: float
    null_std_linear: float

    # --- Sensitivity analysis ---
    # threshold -> diagnosis distribution
    sensitivity_matrix: Dict[str, Dict[str, float]]
70
+
71
+
72
def generate_null_distribution(
    model: "WisentModel",
    n_samples: int = 100,
    hidden_dim: int = 4096,
    *,
    samples_per_class: int = 50,
    k: int = 10,
) -> Tuple[List[float], List[float]]:
    """
    Generate a null distribution by scoring random (signal-free) data.

    Each trial draws two independent standard-normal "activation" sets and
    scores them with the kNN and linear-probe metrics from
    ``wisent.core.geometry_runner``.

    Args:
        model: WisentModel instance. Not read by this function; the
            activations are synthetic noise, not model outputs.
        n_samples: Number of random trials (length of each returned list)
        hidden_dim: Hidden dimension of the random activations
        samples_per_class: Rows drawn per class in every trial
        k: Neighbour count forwarded to ``compute_knn_accuracy``

    Returns:
        (knn_scores, linear_scores) for random data
    """
    from wisent.core.geometry_runner import compute_knn_accuracy, compute_linear_probe_accuracy

    knn_scores: List[float] = []
    linear_scores: List[float] = []

    for _ in range(n_samples):
        # Both classes come from the same distribution, so there is no real
        # signal to find.
        pos = torch.randn(samples_per_class, hidden_dim)
        neg = torch.randn(samples_per_class, hidden_dim)

        knn_scores.append(compute_knn_accuracy(pos, neg, k=k))
        linear_scores.append(compute_linear_probe_accuracy(pos, neg))

    return knn_scores, linear_scores
105
+
106
+
107
def generate_synthetic_data(
    structure: str,
    n_samples: int = 50,
    hidden_dim: int = 100,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Generate synthetic data with known structure for validation.

    Args:
        structure: 'linear', 'xor', 'spirals', 'random'. Any other value
            is treated as 'random'.
        n_samples: Samples per class
        hidden_dim: Dimension. 'xor' and 'spirals' place their structure in
            the first two coordinates and therefore need ``hidden_dim >= 2``.

    Returns:
        (pos_activations, neg_activations), each of shape
        ``(n_samples, hidden_dim)``

    Raises:
        ValueError: If ``structure`` is 'xor' or 'spirals' and
            ``hidden_dim`` is smaller than 2.
    """
    if structure in ("xor", "spirals") and hidden_dim < 2:
        raise ValueError(
            f"structure '{structure}' needs hidden_dim >= 2, got {hidden_dim}"
        )

    if structure == "linear":
        # Linearly separable: the classes are shifted in opposite directions
        # along one random unit vector.
        direction = torch.randn(hidden_dim)
        direction = direction / direction.norm()

        pos = torch.randn(n_samples, hidden_dim) + 2 * direction
        neg = torch.randn(n_samples, hidden_dim) - 2 * direction

    elif structure == "xor":
        # XOR pattern in dims 0 and 1: nonlinear but separable.
        half = n_samples // 2

        # Positive: (dim0 > 0 AND dim1 > 0) OR (dim0 < 0 AND dim1 < 0)
        pos = torch.randn(n_samples, hidden_dim)
        pos[:half, 0] = torch.abs(pos[:half, 0]) + 1
        pos[:half, 1] = torch.abs(pos[:half, 1]) + 1
        pos[half:, 0] = -torch.abs(pos[half:, 0]) - 1
        pos[half:, 1] = -torch.abs(pos[half:, 1]) - 1

        # Negative: dim0 and dim1 have opposite signs
        neg = torch.randn(n_samples, hidden_dim)
        neg[:half, 0] = torch.abs(neg[:half, 0]) + 1
        neg[:half, 1] = -torch.abs(neg[:half, 1]) - 1
        neg[half:, 0] = -torch.abs(neg[half:, 0]) - 1
        neg[half:, 1] = torch.abs(neg[half:, 1]) + 1

    elif structure == "spirals":
        # Interleaved spirals: nonlinear separable. Both arms share the
        # radius t; the negative arm is rotated by pi. Adding pi to the radius
        # as well would put both classes on the same curve r = theta.
        t = torch.linspace(0, 4 * np.pi, n_samples)

        pos = torch.zeros(n_samples, hidden_dim)
        pos[:, 0] = t * torch.cos(t) + 0.5 * torch.randn(n_samples)
        pos[:, 1] = t * torch.sin(t) + 0.5 * torch.randn(n_samples)
        pos[:, 2:] = torch.randn(n_samples, hidden_dim - 2) * 0.1

        neg = torch.zeros(n_samples, hidden_dim)
        neg[:, 0] = t * torch.cos(t + np.pi) + 0.5 * torch.randn(n_samples)
        neg[:, 1] = t * torch.sin(t + np.pi) + 0.5 * torch.randn(n_samples)
        neg[:, 2:] = torch.randn(n_samples, hidden_dim - 2) * 0.1

    else:  # random
        pos = torch.randn(n_samples, hidden_dim)
        neg = torch.randn(n_samples, hidden_dim)

    return pos, neg
171
+
172
+
173
def compute_roc_for_existence(
    real_results: List[Dict],
    null_scores: List[float],
) -> Tuple[List[float], List[float], List[float], float]:
    """
    Build the ROC curve that separates real benchmark scores from null scores.

    Args:
        real_results: Results from real benchmarks; the score of each entry is
            read from ["nonlinear_metrics"]["knn_accuracy_k10"]
        null_scores: kNN scores from null distribution

    Returns:
        (thresholds, tpr, fpr, auc)
    """
    benchmark_scores = [
        entry["nonlinear_metrics"]["knn_accuracy_k10"] for entry in real_results
    ]

    # Real data is the positive class (1), null data the negative class (0)
    y_score = benchmark_scores + null_scores
    y_true = [1] * len(benchmark_scores) + [0] * len(null_scores)

    false_pos_rate, true_pos_rate, cutoffs = roc_curve(y_true, y_score)
    area = auc(false_pos_rate, true_pos_rate)

    return (
        cutoffs.tolist(),
        true_pos_rate.tolist(),
        false_pos_rate.tolist(),
        area,
    )
197
+
198
+
199
def compute_precision_recall_for_gap(
    results: List[Dict],
    ground_truth_linear: List[bool],
) -> Tuple[List[float], List[float], List[float], List[float]]:
    """
    Build the precision-recall curve for the linear-vs-nonlinear gap score.

    Args:
        results: Results from benchmarks; each entry provides
            "signal_strength" and "linear_probe_accuracy"
        ground_truth_linear: Ground truth labels (True = linear, False = nonlinear)

    Returns:
        (thresholds, precision, recall, f1)
    """
    # Score per benchmark: how far the overall signal exceeds the linear probe
    gap_scores = [
        entry["signal_strength"] - entry["linear_probe_accuracy"]
        for entry in results
    ]

    # Nonlinear is the positive class (1); linear is the negative class (0)
    y_true = [int(not is_linear) for is_linear in ground_truth_linear]

    precision_vals, recall_vals, cutoffs = precision_recall_curve(y_true, gap_scores)

    # F1 per curve point; the small epsilon avoids 0/0 when both terms vanish
    f1_scores = []
    for p, r in zip(precision_vals, recall_vals):
        f1_scores.append(2 * p * r / (p + r + 1e-10))

    return cutoffs.tolist(), precision_vals.tolist(), recall_vals.tolist(), f1_scores
225
+
226
+
227
def run_sensitivity_analysis(
    results: List[Dict],
    existence_thresholds: List[float] = [0.5, 0.55, 0.6, 0.65, 0.7],
    gap_thresholds: List[float] = [0.05, 0.10, 0.15, 0.20, 0.25],
) -> Dict[str, Dict[str, Dict[str, float]]]:
    """
    Run sensitivity analysis across threshold combinations.

    Every result is diagnosed as NO_SIGNAL (signal below the existence
    threshold), LINEAR (signal present and gap below the gap threshold) or
    NONLINEAR (signal present and gap at or above the gap threshold).

    Args:
        results: Results from benchmarks; each entry provides
            "signal_strength" and "linear_probe_accuracy"
        existence_thresholds: Thresholds to test for existence
        gap_thresholds: Thresholds to test for gap

    Returns:
        Nested dict: {exist_thresh: {gap_thresh: {diagnosis: percentage}}},
        with thresholds converted to strings. All percentages are 0.0 when
        results is empty.
    """
    # The default lists are only read, never mutated, so sharing them across
    # calls is safe.

    # Signal and gap do not depend on the thresholds: compute them once.
    signal_and_gap = [
        (r["signal_strength"], r["signal_strength"] - r["linear_probe_accuracy"])
        for r in results
    ]
    total = len(signal_and_gap)

    sensitivity = {}

    for exist_t in existence_thresholds:
        per_gap = {}

        for gap_t in gap_thresholds:
            diagnoses = {"LINEAR": 0, "NONLINEAR": 0, "NO_SIGNAL": 0}

            for signal, gap in signal_and_gap:
                if signal < exist_t:
                    diagnoses["NO_SIGNAL"] += 1
                elif gap < gap_t:
                    diagnoses["LINEAR"] += 1
                else:
                    diagnoses["NONLINEAR"] += 1

            # Guard against division by zero when there are no results
            per_gap[str(gap_t)] = {
                k: (v / total * 100 if total else 0.0)
                for k, v in diagnoses.items()
            }

        sensitivity[str(exist_t)] = per_gap

    return sensitivity
268
+
269
+
270
def load_diagnosis_results(model_name: str, output_dir: Path) -> List[Dict]:
    """
    Load all diagnosis results for a model.

    First tries to sync the model's results from S3 into
    ``output_dir / "diagnosis"`` (best effort: a failed sync only prints a
    warning), then reads every ``{model_prefix}_*.json`` file in that
    directory whose name does not contain "summary".

    Args:
        model_name: Model identifier; '/' is replaced by '_' to build the
            S3 prefix and the local file prefix
        output_dir: Directory that holds (or receives) the "diagnosis" folder

    Returns:
        The concatenated "results" lists of all loaded files, in file-name
        order. Empty when nothing could be loaded.
    """
    model_prefix = model_name.replace('/', '_')
    diagnosis_dir = output_dir / "diagnosis"

    # Try to download from S3. This stays best-effort, but failures are
    # reported instead of being silently discarded.
    try:
        sync = subprocess.run(
            ["aws", "s3", "sync",
             f"s3://{S3_BUCKET}/direction_discovery/{model_prefix}/",
             str(diagnosis_dir),
             "--quiet"],
            check=False,
            capture_output=True,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # e.g. the aws CLI is not installed
        print(f"WARNING: could not run 'aws s3 sync' ({exc}); using local files only")
    else:
        if sync.returncode != 0:
            print(
                f"WARNING: 'aws s3 sync' exited with code {sync.returncode}; "
                "local results may be incomplete"
            )

    # Load all results
    all_results: List[Dict] = []
    if not diagnosis_dir.exists():
        return all_results

    # Sorted so the order of the returned results is reproducible
    for path in sorted(diagnosis_dir.glob(f"{model_prefix}_*.json")):
        if "summary" in path.name:
            continue
        try:
            with open(path, encoding="utf-8") as fp:
                data = json.load(fp)
        except (OSError, ValueError) as exc:
            # ValueError covers malformed JSON and undecodable bytes; one bad
            # file should not discard every other result.
            print(f"WARNING: skipping unreadable result file {path}: {exc}")
            continue
        all_results.extend(data.get("results", []))

    return all_results
299
+
300
+
301
def run_threshold_analysis(model_name: str):
    """
    Run full threshold analysis.

    Steps: load the stored diagnosis results, generate a null distribution,
    pick an existence threshold from the ROC curve (Youden's J), derive a gap
    threshold from synthetic data, print a sensitivity table, then write the
    outcome as JSON under /tmp/threshold_analysis and hand the file to
    s3_upload_file.

    Args:
        model_name: Model to analyze

    Returns:
        The ThresholdAnalysisResult, or None when no diagnosis results were
        found.
    """
    print("=" * 70)
    print("THRESHOLD ANALYSIS")
    print("=" * 70)
    print(f"Model: {model_name}")

    output_dir = Path("/tmp/threshold_analysis")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load diagnosis results
    results = load_diagnosis_results(model_name, output_dir)
    if not results:
        print("ERROR: No diagnosis results found.")
        return

    print(f"Loaded {len(results)} results")

    # 1. Generate null distribution
    print("\n1. Generating null distribution...")
    null_knn, null_linear = generate_null_distribution(None, n_samples=100, hidden_dim=4096)

    print(f" Null kNN: mean={np.mean(null_knn):.3f}, std={np.std(null_knn):.3f}")
    print(f" Null linear: mean={np.mean(null_linear):.3f}, std={np.std(null_linear):.3f}")

    # 2. ROC for existence threshold
    print("\n2. Computing ROC for existence threshold...")
    thresholds, tpr, fpr, roc_auc = compute_roc_for_existence(results, null_knn)

    # Find optimal threshold (Youden's J)
    j_scores = [t - f for t, f in zip(tpr, fpr)]
    optimal_idx = int(np.argmax(j_scores))
    optimal_exist = thresholds[optimal_idx] if optimal_idx < len(thresholds) else 0.6
    # roc_curve's first threshold is a sentinel above every score (np.inf in
    # recent scikit-learn). It wins the argmax when no real cut-off beats
    # chance; fall back to the default instead of reporting a non-finite value.
    if not np.isfinite(optimal_exist):
        optimal_exist = 0.6

    print(f" AUC: {roc_auc:.3f}")
    print(f" Optimal existence threshold: {optimal_exist:.3f}")

    # 3. Synthetic validation
    print("\n3. Synthetic validation...")
    from wisent.core.geometry_runner import compute_knn_accuracy, compute_linear_probe_accuracy

    synthetic_results = {}
    for structure in ["linear", "xor", "spirals", "random"]:
        pos, neg = generate_synthetic_data(structure)
        knn = compute_knn_accuracy(pos, neg, k=10)
        linear = compute_linear_probe_accuracy(pos, neg)
        gap = knn - linear

        synthetic_results[structure] = {
            "knn": knn,
            "linear": linear,
            "gap": gap,
        }
        print(f" {structure}: kNN={knn:.3f}, linear={linear:.3f}, gap={gap:.3f}")

    # Validate that gap threshold separates linear from nonlinear
    linear_gap = synthetic_results["linear"]["gap"]
    xor_gap = synthetic_results["xor"]["gap"]
    spirals_gap = synthetic_results["spirals"]["gap"]
    nonlinear_gap = min(xor_gap, spirals_gap)

    # Good gap threshold should be > linear_gap and < min(xor_gap, spirals_gap)
    optimal_gap = (linear_gap + nonlinear_gap) / 2
    print(f"\n Suggested gap threshold: {optimal_gap:.3f}")
    if linear_gap >= nonlinear_gap:
        # The midpoint only separates the classes when the linear gap is the
        # smaller one; otherwise the suggestion is not meaningful.
        print(" WARNING: synthetic gaps do not separate linear from nonlinear structure")

    # 4. Sensitivity analysis
    print("\n4. Running sensitivity analysis...")
    sensitivity = run_sensitivity_analysis(results)

    print("\n Diagnosis distribution (% of benchmarks):")
    print(" " + "-" * 60)
    print(f" {'Exist':>6} | {'Gap':>6} | {'LINEAR':>8} | {'NONLINEAR':>10} | {'NO_SIGNAL':>10}")
    print(" " + "-" * 60)

    for exist_t, gap_data in sensitivity.items():
        for gap_t, diagnoses in gap_data.items():
            print(f" {exist_t:>6} | {gap_t:>6} | {diagnoses['LINEAR']:>7.1f}% | "
                  f"{diagnoses['NONLINEAR']:>9.1f}% | {diagnoses['NO_SIGNAL']:>9.1f}%")

    # 5. Save results
    # NOTE(review): thresholds[0] may be non-finite (see above); json.dump then
    # writes it as the non-standard token Infinity. Confirm that consumers of
    # this file accept that.
    analysis_result = ThresholdAnalysisResult(
        existence_thresholds=thresholds[:100],  # Limit for JSON
        existence_tpr=tpr[:100],
        existence_fpr=fpr[:100],
        existence_auc=roc_auc,
        optimal_existence_threshold=float(optimal_exist),
        gap_thresholds=[0.05, 0.10, 0.15, 0.20, 0.25],
        gap_precision=[],  # Would need ground truth
        gap_recall=[],
        gap_f1=[],
        optimal_gap_threshold=float(optimal_gap),
        null_mean_knn=float(np.mean(null_knn)),
        null_std_knn=float(np.std(null_knn)),
        null_mean_linear=float(np.mean(null_linear)),
        null_std_linear=float(np.std(null_linear)),
        sensitivity_matrix=sensitivity,
    )

    model_prefix = model_name.replace('/', '_')
    results_file = output_dir / f"{model_prefix}_threshold_analysis.json"

    with open(results_file, "w") as f:
        json.dump(asdict(analysis_result), f, indent=2)

    print(f"\nResults saved to: {results_file}")
    s3_upload_file(results_file, model_name)

    # Summary
    print("\n" + "=" * 70)
    print("RECOMMENDATIONS")
    print("=" * 70)
    print(f"\n1. Existence threshold: {optimal_exist:.2f}")
    print(f" - Based on ROC analysis (AUC={roc_auc:.3f})")
    print(f" - Null distribution: kNN={np.mean(null_knn):.3f} ± {np.std(null_knn):.3f}")

    print(f"\n2. Gap threshold: {optimal_gap:.2f}")
    print(f" - Based on synthetic validation")
    print(f" - Linear structure gap: {linear_gap:.3f}")
    print(f" - XOR structure gap: {xor_gap:.3f}")
    print(f" - Spirals structure gap: {spirals_gap:.3f}")

    return analysis_result
427
+
428
+
429
if __name__ == "__main__":
    # Command-line entry point: read the model name and analyze that model.
    cli = argparse.ArgumentParser(description="Threshold analysis for RepScan")
    cli.add_argument(
        "--model",
        type=str,
        default="Qwen/Qwen3-8B",
        help="Model to analyze",
    )
    cli_args = cli.parse_args()

    run_threshold_analysis(cli_args.model)