wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (725)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +22 -6
  3. wisent/core/activations/activations.py +21 -39
  4. wisent/core/activations/activations_collector.py +141 -373
  5. wisent/core/activations/classifier_inference_strategy.py +194 -0
  6. wisent/core/activations/core/atoms.py +8 -92
  7. wisent/core/activations/extraction_strategy.py +308 -0
  8. wisent/core/agent/diagnose/response_diagnostics.py +3 -3
  9. wisent/core/agent/diagnose.py +3 -3
  10. wisent/core/autonomous_agent.py +2 -2
  11. wisent/core/cli/agent/apply_steering.py +23 -27
  12. wisent/core/cli/agent/evaluate_response.py +18 -20
  13. wisent/core/cli/agent/train_classifier.py +18 -20
  14. wisent/core/cli/cluster_benchmarks.py +472 -0
  15. wisent/core/cli/create_steering_vector.py +13 -5
  16. wisent/core/cli/generate_vector_from_task.py +4 -0
  17. wisent/core/cli/get_activations.py +12 -36
  18. wisent/core/cli/method_optimizer.py +859 -0
  19. wisent/core/cli/optimize.py +44 -5
  20. wisent/core/cli/optimize_classification.py +5 -6
  21. wisent/core/cli/optimize_sample_size.py +8 -22
  22. wisent/core/cli/optimize_steering.py +429 -153
  23. wisent/core/cli/optimize_weights.py +65 -6
  24. wisent/core/cli/steering_method_trainer.py +5 -4
  25. wisent/core/cli/steering_search_space.py +20 -15
  26. wisent/core/cli/tasks.py +14 -43
  27. wisent/core/cli/train_unified_goodness.py +17 -18
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1578 -173
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +63 -80
  30. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
  36. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
  37. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -5
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
  43. wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
  44. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
  45. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
  46. wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
  47. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
  48. wisent/core/evaluators/personalization/coherence.py +46 -0
  49. wisent/core/hyperparameter_optimizer.py +13 -13
  50. wisent/core/lm_eval_harness_ground_truth.py +7 -11
  51. wisent/core/main.py +3 -0
  52. wisent/core/models/wisent_model.py +8 -7
  53. wisent/core/opti/methods/opti_weights.py +29 -2
  54. wisent/core/optuna/classifier/activation_generator.py +14 -12
  55. wisent/core/optuna/steering/steering_optimization.py +14 -9
  56. wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
  57. wisent/core/parser_arguments/generate_vector_from_task_parser.py +20 -0
  58. wisent/core/parser_arguments/main_parser.py +8 -0
  59. wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
  60. wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
  61. wisent/core/parser_arguments/tasks_parser.py +7 -19
  62. wisent/core/steering_methods/core/atoms.py +1 -2
  63. wisent/core/steering_methods/methods/caa.py +1 -1
  64. wisent/core/steering_methods/methods/hyperplane.py +74 -0
  65. wisent/core/steering_methods/methods/prism.py +1 -2
  66. wisent/core/steering_methods/methods/pulse.py +39 -8
  67. wisent/core/steering_methods/methods/titan.py +59 -14
  68. wisent/core/steering_methods/registry.py +52 -12
  69. wisent/core/steering_optimizer.py +15 -15
  70. wisent/core/trainers/steering_trainer.py +9 -18
  71. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
  72. wisent/scripts/run_quality_metrics_sweep.sh +22 -27
  73. wisent/tests/test_aggregation_geometry.py +236 -0
  74. wisent/tests/test_detector_accuracy.py +163 -0
  75. wisent/tests/test_geometry_exhaustive.py +1202 -0
  76. wisent/tests/visualize_geometry.py +255 -61
  77. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/METADATA +1 -1
  78. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/RECORD +82 -714
  79. wisent/core/activations/prompt_construction_strategy.py +0 -47
  80. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
  81. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
  82. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
  83. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
  84. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
  85. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
  86. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
  87. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
  88. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
  89. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
  90. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
  96. wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
  97. wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
  98. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
  99. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
  100. wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
  101. wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
  102. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
  103. wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
  104. wisent/examples/scripts/results/benchmark_tags.json +0 -917
  105. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
  106. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
  107. wisent/examples/scripts/results/failing_benchmarks.json +0 -946
  108. wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
  109. wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
  110. wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
  111. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
  112. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
  113. wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
  114. wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
  115. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
  116. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
  117. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
  118. wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
  119. wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
  120. wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
  121. wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
  122. wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
  123. wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
  124. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
  125. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
  126. wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
  127. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
  128. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
  129. wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
  130. wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
  131. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
  132. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
  133. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
  134. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
  135. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  136. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
  137. wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
  138. wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
  139. wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
  140. wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
  141. wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
  142. wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
  143. wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
  144. wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
  145. wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
  146. wisent/examples/scripts/results/test_aime_pairs.json +0 -8
  147. wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
  148. wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
  149. wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
  150. wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
  151. wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
  152. wisent/examples/scripts/results/test_anli_pairs.json +0 -8
  153. wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
  154. wisent/examples/scripts/results/test_apps_pairs.json +0 -8
  155. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
  156. wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
  157. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
  158. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
  159. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
  160. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
  161. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
  162. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
  163. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
  164. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
  165. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
  166. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
  167. wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
  168. wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
  169. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
  170. wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
  171. wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
  172. wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
  173. wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
  174. wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
  175. wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
  176. wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
  177. wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
  178. wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
  179. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
  180. wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
  181. wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
  182. wisent/examples/scripts/results/test_atis_pairs.json +0 -8
  183. wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
  184. wisent/examples/scripts/results/test_babi_pairs.json +0 -8
  185. wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
  186. wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
  187. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
  188. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
  189. wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
  190. wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
  191. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
  192. wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
  193. wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
  194. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
  195. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
  196. wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
  197. wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
  198. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
  199. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
  200. wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
  201. wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
  202. wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
  203. wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
  204. wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
  205. wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
  206. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
  207. wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
  208. wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
  209. wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
  210. wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
  211. wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
  212. wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
  213. wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
  214. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
  215. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
  216. wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
  217. wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
  218. wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
  219. wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
  220. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
  221. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
  222. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
  223. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
  224. wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
  225. wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
  226. wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
  227. wisent/examples/scripts/results/test_c4_pairs.json +0 -8
  228. wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
  229. wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
  230. wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
  231. wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
  232. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
  233. wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
  234. wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
  235. wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
  236. wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
  237. wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
  238. wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
  239. wisent/examples/scripts/results/test_cb_pairs.json +0 -8
  240. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
  241. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
  242. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
  243. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
  244. wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
  245. wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
  246. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
  247. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
  248. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
  249. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
  250. wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
  251. wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
  252. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
  253. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
  254. wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
  255. wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
  256. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
  257. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
  258. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
  259. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
  260. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
  261. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
  262. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
  263. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
  264. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
  265. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
  266. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
  267. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
  268. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
  269. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
  270. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
  271. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
  272. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
  273. wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
  274. wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
  275. wisent/examples/scripts/results/test_cola_pairs.json +0 -8
  276. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
  277. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
  278. wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
  279. wisent/examples/scripts/results/test_conala_pairs.json +0 -8
  280. wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
  281. wisent/examples/scripts/results/test_concode_pairs.json +0 -8
  282. wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
  283. wisent/examples/scripts/results/test_copa_pairs.json +0 -8
  284. wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
  285. wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
  286. wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
  287. wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
  288. wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
  289. wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
  290. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
  291. wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
  292. wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
  293. wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
  294. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
  295. wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
  296. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
  297. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
  298. wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
  299. wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
  300. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
  301. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
  302. wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
  303. wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
  304. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
  305. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
  306. wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
  307. wisent/examples/scripts/results/test_drop_pairs.json +0 -8
  308. wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
  309. wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
  310. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
  311. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
  312. wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
  313. wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
  314. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
  315. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
  316. wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
  317. wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
  318. wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
  319. wisent/examples/scripts/results/test_escola_pairs.json +0 -8
  320. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
  321. wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
  322. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
  323. wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
  324. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
  325. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
  326. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
  327. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
  328. wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
  329. wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
  330. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
  331. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
  332. wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
  333. wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
  334. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
  335. wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
  336. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
  337. wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
  338. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  339. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  340. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
  341. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
  342. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
  343. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
  344. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
  345. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
  346. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
  347. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
  348. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  349. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  350. wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
  351. wisent/examples/scripts/results/test_fda_pairs.json +0 -8
  352. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
  353. wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
  354. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
  355. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
  356. wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
  357. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
  358. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
  359. wisent/examples/scripts/results/test_fld_pairs.json +0 -8
  360. wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
  361. wisent/examples/scripts/results/test_flores_pairs.json +0 -14
  362. wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
  363. wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
  364. wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
  365. wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
  366. wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
  367. wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
  368. wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
  369. wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
  370. wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
  371. wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
  372. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
  373. wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
  374. wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
  375. wisent/examples/scripts/results/test_glue_pairs.json +0 -14
  376. wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
  377. wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
  378. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
  379. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
  380. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
  381. wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
  382. wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
  383. wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
  384. wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
  385. wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
  386. wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
  387. wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
  388. wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
  389. wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
  390. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
  391. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
  392. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
  393. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
  394. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
  395. wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
  396. wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
  397. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
  398. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
  399. wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
  400. wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
  401. wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
  402. wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
  403. wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
  404. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
  405. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
  406. wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
  407. wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
  408. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
  409. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
  410. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
  411. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
  412. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
  413. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
  414. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
  415. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
  416. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
  417. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
  418. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
  419. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
  420. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
  421. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
  422. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
  423. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
  424. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
  425. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
  426. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
  427. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
  428. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
  429. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
  430. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
  431. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
  432. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
  433. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
  434. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
  435. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
  436. wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
  437. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
  438. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
  439. wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
  440. wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
  441. wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
  442. wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
  443. wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
  444. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
  445. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
  446. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
  447. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
  448. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
  449. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
  450. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
  451. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
  452. wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
  453. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  454. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  455. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
  456. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
  457. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
  458. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
  459. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
  460. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
  461. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
  462. wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
  463. wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
  464. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  465. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  466. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  467. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  468. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
  469. wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
  470. wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
  471. wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
  472. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
  473. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
  474. wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
  475. wisent/examples/scripts/results/test_libra_pairs.json +0 -14
  476. wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
  477. wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
  478. wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
  479. wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
  480. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
  481. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
  482. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
  483. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
  484. wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
  485. wisent/examples/scripts/results/test_llama_pairs.json +0 -8
  486. wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
  487. wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
  488. wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
  489. wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
  490. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
  491. wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
  492. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
  493. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
  494. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
  495. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
  496. wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
  497. wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
  498. wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
  499. wisent/examples/scripts/results/test_math500_pairs.json +0 -8
  500. wisent/examples/scripts/results/test_math_evaluation.json +0 -30
  501. wisent/examples/scripts/results/test_math_pairs.json +0 -8
  502. wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
  503. wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
  504. wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
  505. wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
  506. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
  507. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
  508. wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
  509. wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
  510. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
  511. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
  512. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
  513. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
  514. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
  515. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
  516. wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
  517. wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
  518. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
  519. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
  520. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
  521. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
  522. wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
  523. wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
  524. wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
  525. wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
  526. wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
  527. wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
  528. wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
  529. wisent/examples/scripts/results/test_mela_pairs.json +0 -14
  530. wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
  531. wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
  532. wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
  533. wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
  534. wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
  535. wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
  536. wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
  537. wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
  538. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
  539. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
  540. wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
  541. wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
  542. wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
  543. wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
  544. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
  545. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
  546. wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
  547. wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
  548. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
  549. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
  550. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
  551. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
  552. wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
  553. wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
  554. wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
  555. wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
  556. wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
  557. wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
  558. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
  559. wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
  560. wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
  561. wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
  562. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
  563. wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
  564. wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
  565. wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
  566. wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
  567. wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
  568. wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
  569. wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
  570. wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
  571. wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
  572. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
  573. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
  574. wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
  575. wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
  576. wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
  577. wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
  578. wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
  579. wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
  580. wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
  581. wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
  582. wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
  583. wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
  584. wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
  585. wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
  586. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
  587. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
  588. wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
  589. wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
  590. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
  591. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
  592. wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
  593. wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
  594. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
  595. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
  596. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
  597. wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
  598. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
  599. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
  600. wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
  601. wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
  602. wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
  603. wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
  604. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
  605. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
  606. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
  607. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
  608. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
  609. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
  610. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
  611. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
  612. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
  613. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
  614. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  615. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  616. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  617. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  618. wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
  619. wisent/examples/scripts/results/test_prost_pairs.json +0 -8
  620. wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
  621. wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
  622. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
  623. wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
  624. wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
  625. wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
  626. wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
  627. wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
  628. wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
  629. wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
  630. wisent/examples/scripts/results/test_race_evaluation.json +0 -30
  631. wisent/examples/scripts/results/test_race_pairs.json +0 -8
  632. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
  633. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
  634. wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
  635. wisent/examples/scripts/results/test_recode_pairs.json +0 -8
  636. wisent/examples/scripts/results/test_record_evaluation.json +0 -30
  637. wisent/examples/scripts/results/test_record_pairs.json +0 -8
  638. wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
  639. wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
  640. wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
  641. wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
  642. wisent/examples/scripts/results/test_score_evaluation.json +0 -51
  643. wisent/examples/scripts/results/test_score_pairs.json +0 -14
  644. wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
  645. wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
  646. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
  647. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
  648. wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
  649. wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
  650. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
  651. wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
  652. wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
  653. wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
  654. wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
  655. wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
  656. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
  657. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
  658. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
  659. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
  660. wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
  661. wisent/examples/scripts/results/test_swag_pairs.json +0 -8
  662. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
  663. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
  664. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
  665. wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
  666. wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
  667. wisent/examples/scripts/results/test_translation_pairs.json +0 -14
  668. wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
  669. wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
  670. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
  671. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
  672. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
  673. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
  674. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
  675. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
  676. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
  677. wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
  678. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
  679. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
  680. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
  681. wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
  682. wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
  683. wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
  684. wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
  685. wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
  686. wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
  687. wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
  688. wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
  689. wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
  690. wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
  691. wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
  692. wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
  693. wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
  694. wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
  695. wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
  696. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
  697. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
  698. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
  699. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
  700. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
  701. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
  702. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
  703. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
  704. wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
  705. wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
  706. wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
  707. wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
  708. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
  709. wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
  710. wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
  711. wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
  712. wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
  713. wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
  714. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
  715. wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
  716. wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
  717. wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
  718. wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
  719. wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
  720. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
  721. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
  722. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/WHEEL +0 -0
  723. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/entry_points.txt +0 -0
  724. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/licenses/LICENSE +0 -0
  725. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1202 @@
1
+ """
2
+ Exhaustive layer combination analysis.
3
+
4
+ Tests all 2^N - 1 layer combinations to find optimal layer subsets
5
+ for geometry detection.
6
+
7
+ Uses CLI commands for pair generation and activation extraction.
8
+
9
+ ===============================================================================
10
+ DEBUGGING NOTES - READ BEFORE MAKING ASSUMPTIONS
11
+ ===============================================================================
12
+
13
+ On Dec 15, 2025, a Qwen3-8B run (36 layers, i.e. 2^36 - 1 = ~68.7 billion combinations) became
14
+ unresponsive after starting step [5]. The instance lost SSM connection, SSH
15
+ timed out, and required a reboot.
16
+
17
+ WHAT WE KNOW (facts with evidence):
18
+ - Step [5] started: "Running exhaustive analysis (68719476735 combinations)..."
19
+ - No further output after that line
20
+ - Instance became unreachable (SSM ConnectionLost, SSH timeout)
21
+ - After reboot, dmesg.0 showed NO OOM messages
22
+ - kern.log had no errors between 18:30 (step 5 start) and 19:58 (reboot)
23
+
24
+ WHAT WE DO NOT KNOW (no evidence):
25
+ - Whether the process was running or stuck
26
+ - Whether memory was exhausted (no OOM in logs)
27
+ - Whether CPU was pegged
28
+ - The actual cause of unresponsiveness
29
+
30
+ DO NOT ASSUME:
31
+ - That 68 billion combinations is "too many" without measuring
32
+ - That the list allocation caused OOM (no evidence)
33
+ - That the loop is slow (no benchmarks)
34
+ - ANY root cause without actual evidence from logs/metrics
35
+
36
+ If investigating future failures:
37
+ 1. Check dmesg BEFORE rebooting for OOM messages
38
+ 2. Check /var/log/kern.log for errors
39
+ 3. Try to SSH and run 'top', 'free -h', 'ps aux' before assuming crash
40
+ 4. Get actual memory/CPU metrics, don't guess
41
+
42
+ The instance may have been working fine but just not producing output.
43
+ ===============================================================================
44
+ """
45
+
46
+ import json
47
+ import os
48
+ import subprocess
49
+ import sys
50
+ import tempfile
51
+ import time
52
+ import torch
53
+ from datetime import datetime
54
+ from typing import Dict, List
55
+
56
+
57
+ def run_exhaustive_layer_analysis(
58
+ task: str = "truthfulqa_gen",
59
+ model: str = "meta-llama/Llama-3.2-1B-Instruct",
60
+ num_pairs: int = 50,
61
+ max_layers: int | None = None,
62
+ output_dir: str = "/home/ubuntu/output",
63
+ ):
64
+ """
65
+ Run exhaustive layer combination analysis.
66
+
67
+ Tests all 2^N - 1 layer combinations to find which layer subsets
68
+ produce the strongest geometric structure detection.
69
+
70
+ Uses CLI commands:
71
+ - generate-pairs-from-task: Generate contrastive pairs
72
+ - get-activations: Extract activations for all layers
73
+
74
+ Automatically detects the model's layer count.
75
+
76
+ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
77
+ WARNING: DO NOT SET max_layers TO REDUCE THE NUMBER OF LAYERS TESTED.
78
+
79
+ The whole point of this analysis is to test ALL layer combinations.
80
+ If you need to reduce combinations for feasibility:
81
+ 1. Use a larger instance (g6e.2xlarge = 64GB, g6e.4xlarge = 128GB, g6e.12xlarge = 384GB)
82
+ 2. Wait longer - it's supposed to take hours/days
83
+ 3. DO NOT artificially cap layers - that defeats the purpose
84
+
85
+ max_layers exists ONLY for debugging/testing purposes, NOT for production runs.
86
+ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
87
+ """
88
+ from wisent.core.contrastive_pairs.diagnostics.control_vectors import (
89
+ detect_geometry_exhaustive,
90
+ )
91
+
92
+ sys.stdout.reconfigure(line_buffering=True)
93
+
94
+ print("=" * 80)
95
+ print("EXHAUSTIVE LAYER COMBINATION ANALYSIS")
96
+ print("=" * 80)
97
+ print(f"Task: {task}")
98
+ print(f"Model: {model}")
99
+ print(f"Num pairs: {num_pairs}")
100
+ print(f"Output dir: {output_dir}")
101
+
102
+ # Auto-detect model layer count from config (without loading weights)
103
+ print(f"\n[0] Detecting model layer count from config...")
104
+ start = time.time()
105
+ from transformers import AutoConfig
106
+ config = AutoConfig.from_pretrained(model, trust_remote_code=True)
107
+ # Different models use different config keys for layer count
108
+ model_layers = getattr(config, 'num_hidden_layers', None) or \
109
+ getattr(config, 'n_layer', None) or \
110
+ getattr(config, 'num_layers', None) or 32
111
+ print(f" Model has {model_layers} layers (detected in {time.time() - start:.1f}s)")
112
+
113
+ # Determine layers to use
114
+ if max_layers is not None:
115
+ num_layers = min(max_layers, model_layers)
116
+ print(f" Using {num_layers} layers (limited by --max-layers)")
117
+ else:
118
+ num_layers = model_layers
119
+
120
+ print(f" Total combinations to test: {2**num_layers - 1:,}")
121
+ print("=" * 80)
122
+
123
+ with tempfile.TemporaryDirectory() as tmpdir:
124
+ pairs_file = os.path.join(tmpdir, "pairs.json")
125
+ activations_file = os.path.join(tmpdir, "activations.json")
126
+
127
+ # Step 1: Generate pairs using CLI
128
+ print(f"\n[1] Generating {num_pairs} pairs for {task}...")
129
+ start = time.time()
130
+ result = subprocess.run(
131
+ [
132
+ sys.executable, "-m", "wisent.core.main", "generate-pairs-from-task",
133
+ task,
134
+ "--output", pairs_file,
135
+ "--limit", str(num_pairs),
136
+ ],
137
+ capture_output=True,
138
+ text=True,
139
+ timeout=600
140
+ )
141
+ if result.returncode != 0:
142
+ print(f"ERROR: Pair generation failed: {result.stderr}")
143
+ return
144
+ print(f" Generated pairs in {time.time() - start:.1f}s")
145
+
146
+ # Step 2: Get activations for ALL layers using CLI
147
+ # Build layers string: "1,2,3,...,num_layers"
148
+ layers_str = ",".join(str(i) for i in range(1, num_layers + 1))
149
+
150
+ print(f"\n[2] Extracting activations for layers 1-{num_layers}...")
151
+ start = time.time()
152
+ result = subprocess.run(
153
+ [
154
+ sys.executable, "-m", "wisent.core.main", "get-activations",
155
+ pairs_file,
156
+ "--output", activations_file,
157
+ "--model", model,
158
+ "--layers", layers_str,
159
+ "--token-aggregation", "final",
160
+ ],
161
+ capture_output=True,
162
+ text=True,
163
+ timeout=1800 # 30 min timeout for activation extraction
164
+ )
165
+ if result.returncode != 0:
166
+ print(f"ERROR: Activation extraction failed: {result.stderr}")
167
+ return
168
+ print(f" Extracted activations in {time.time() - start:.1f}s")
169
+
170
+ # Step 3: Load activations from JSON
171
+ print("\n[3] Loading activations from file...")
172
+ with open(activations_file, 'r') as f:
173
+ data = json.load(f)
174
+
175
+ pairs_list = data.get('pairs', [])
176
+ print(f" Loaded {len(pairs_list)} pairs with activations")
177
+
178
+ # Step 4: Convert to tensors by layer
179
+ print("\n[4] Converting to tensors by layer...")
180
+ pos_by_layer: Dict[int, List[torch.Tensor]] = {}
181
+ neg_by_layer: Dict[int, List[torch.Tensor]] = {}
182
+
183
+ for pair in pairs_list:
184
+ pos_la = pair.get('positive_response', {}).get('layers_activations', {})
185
+ neg_la = pair.get('negative_response', {}).get('layers_activations', {})
186
+
187
+ for layer_key in pos_la:
188
+ layer = int(layer_key)
189
+ if max_layers is not None and layer > max_layers:
190
+ continue
191
+
192
+ if layer not in pos_by_layer:
193
+ pos_by_layer[layer] = []
194
+ neg_by_layer[layer] = []
195
+
196
+ if layer_key in pos_la and layer_key in neg_la:
197
+ pos_by_layer[layer].append(torch.tensor(pos_la[layer_key]).reshape(-1))
198
+ neg_by_layer[layer].append(torch.tensor(neg_la[layer_key]).reshape(-1))
199
+
200
+ # Stack into tensors
201
+ pos_tensors = {}
202
+ neg_tensors = {}
203
+ layers_available = sorted(pos_by_layer.keys())
204
+
205
+ for layer in layers_available:
206
+ if pos_by_layer[layer] and neg_by_layer[layer]:
207
+ pos_tensors[layer] = torch.stack(pos_by_layer[layer])
208
+ neg_tensors[layer] = torch.stack(neg_by_layer[layer])
209
+ print(f" Layer {layer}: {pos_tensors[layer].shape}")
210
+
211
+ num_layers = len(pos_tensors)
212
+ actual_combos = 2 ** num_layers - 1
213
+ print(f"\n {num_layers} layers available -> {actual_combos} combinations to test")
214
+
215
+ # Step 5: Run exhaustive analysis
216
+ print(f"\n[5] Running exhaustive analysis ({actual_combos} combinations)...")
217
+ start = time.time()
218
+
219
+ last_report = [0, time.time()] # [last_count, last_time]
220
+ def progress_callback(current: int, total: int):
221
+ # Report every 10000 combinations OR every 30 seconds, whichever comes first
222
+ now = time.time()
223
+ if current - last_report[0] >= 10000 or now - last_report[1] >= 30:
224
+ elapsed = now - start
225
+ rate = current / elapsed if elapsed > 0 else 0
226
+ remaining = (total - current) / rate if rate > 0 else float('inf')
227
+ pct = 100 * current / total
228
+ print(f" Progress: {current:,}/{total:,} ({pct:.4f}%) - {rate:.1f} combos/sec - ETA: {remaining:.0f}s")
229
+ last_report[0] = current
230
+ last_report[1] = now
231
+
232
+ result = detect_geometry_exhaustive(
233
+ pos_tensors,
234
+ neg_tensors,
235
+ max_layers=num_layers,
236
+ combination_method="concat",
237
+ progress_callback=progress_callback,
238
+ )
239
+
240
+ elapsed = time.time() - start
241
+ print(f"\n Completed in {elapsed:.1f}s ({actual_combos / elapsed:.1f} combos/sec)")
242
+
243
+ # Print results
244
+ print("\n" + "=" * 80)
245
+ print("RESULTS")
246
+ print("=" * 80)
247
+
248
+ print(f"\nTotal combinations tested: {result.total_combinations}")
249
+ print(f"\nBest combination: {result.best_combination}")
250
+ print(f"Best score: {result.best_score:.4f}")
251
+ print(f"Best structure: {result.best_structure.value}")
252
+
253
+ print(f"\nBest single layer: L{result.single_layer_best}")
254
+ print(f"Best single layer score: {result.single_layer_best_score:.4f}")
255
+ print(f"Combination beats single: {result.combination_beats_single}")
256
+ print(f"Improvement over single: {result.improvement_over_single:.4f}")
257
+
258
+ print("\n--- Top 10 Combinations ---")
259
+ for i, r in enumerate(result.top_10):
260
+ layers_str = "+".join(f"L{l}" for l in r.layers)
261
+ print(f" {i+1}. {layers_str}: {r.best_structure.value} = {r.best_score:.4f}")
262
+
263
+ print("\n--- Patterns ---")
264
+ print(f" Most important layers: {result.patterns.get('most_important_layers', [])}")
265
+ print(f" Optimal combination size: {result.patterns.get('optimal_combination_size', 1)}")
266
+ print(f" Dominant structure: {result.patterns.get('dominant_structure', 'unknown')}")
267
+ print(f" Best score by size: {result.patterns.get('best_score_by_size', {})}")
268
+ print(f" Early vs late ratio: {result.patterns.get('early_vs_late_ratio', 0):.2f}")
269
+
270
+ print(f"\n--- Recommendation ---")
271
+ print(f" {result.recommendation}")
272
+
273
+ # Save results
274
+ os.makedirs(output_dir, exist_ok=True)
275
+ output_file = os.path.join(output_dir, f"exhaustive_geometry_{task}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")
276
+
277
+ # Convert to serializable format
278
+ results_json = {
279
+ "task": task,
280
+ "model": model,
281
+ "num_pairs": num_pairs,
282
+ "max_layers": num_layers,
283
+ "total_combinations": result.total_combinations,
284
+ "elapsed_seconds": elapsed,
285
+ "best_combination": list(result.best_combination),
286
+ "best_score": result.best_score,
287
+ "best_structure": result.best_structure.value,
288
+ "single_layer_best": result.single_layer_best,
289
+ "single_layer_best_score": result.single_layer_best_score,
290
+ "combination_beats_single": result.combination_beats_single,
291
+ "improvement_over_single": result.improvement_over_single,
292
+ "top_10": [
293
+ {
294
+ "layers": list(r.layers),
295
+ "best_structure": r.best_structure.value,
296
+ "best_score": r.best_score,
297
+ "all_scores": r.all_scores,
298
+ }
299
+ for r in result.top_10
300
+ ],
301
+ "top_100": [
302
+ {
303
+ "layers": list(r.layers),
304
+ "best_structure": r.best_structure.value,
305
+ "best_score": r.best_score,
306
+ }
307
+ for r in result.all_results[:100]
308
+ ],
309
+ "patterns": {
310
+ k: v if not isinstance(v, float) or not (v != v) else None # Handle NaN
311
+ for k, v in result.patterns.items()
312
+ },
313
+ "recommendation": result.recommendation,
314
+ }
315
+
316
+ with open(output_file, "w") as f:
317
+ json.dump(results_json, f, indent=2)
318
+ print(f"\nResults saved to: {output_file}")
319
+
320
+ return result
321
+
322
+
323
def run_limited_layer_analysis(
    task: str = "truthfulqa_gen",
    model: str = "meta-llama/Llama-3.2-1B-Instruct",
    num_pairs: int = 50,
    max_combo_size: int = 3,
    output_dir: str = "/home/ubuntu/output",
):
    """
    Run limited layer combination analysis.

    Tests 1-layer, 2-layer, 3-layer combinations plus all layers combined.
    Much faster than exhaustive: O(N^3) instead of O(2^N).

    For 36 layers with max_combo_size=3:
    - 36 + 630 + 7140 + 1 = 7,807 combinations (vs 68 billion exhaustive)

    Pipeline: read the layer count from the model config, call the wisent CLI
    in subprocesses to generate contrastive pairs and extract activations
    into a temporary directory, stack the activations into one tensor per
    layer, run ``detect_geometry_limited``, print a report and write a JSON
    summary into ``output_dir``.

    Args:
        task: Task name passed to the ``generate-pairs-from-task`` command.
        model: Model identifier used for ``AutoConfig.from_pretrained`` and
            passed to ``get-activations --model``.
        num_pairs: Passed to pair generation as ``--limit``.
        max_combo_size: Forwarded to ``detect_geometry_limited``.
        output_dir: Directory (created if missing) that receives the JSON
            summary file.

    Returns:
        The result object of ``detect_geometry_limited``, or ``None`` when a
        CLI step exits with a non-zero status or no layer has paired
        activations.
    """
    from wisent.core.contrastive_pairs.diagnostics.control_vectors import (
        detect_geometry_limited,
    )
    from math import comb, isnan

    sys.stdout.reconfigure(line_buffering=True)

    print("=" * 80)
    print("LIMITED LAYER COMBINATION ANALYSIS")
    print("=" * 80)
    print(f"Task: {task}")
    print(f"Model: {model}")
    print(f"Num pairs: {num_pairs}")
    print(f"Max combo size: {max_combo_size}")
    print(f"Output dir: {output_dir}")

    # Auto-detect model layer count from config
    print("\n[0] Detecting model layer count from config...")
    start = time.time()
    from transformers import AutoConfig
    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally; only use with model sources you trust.
    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
    # Config classes name the layer count differently; fall back to 32.
    model_layers = (
        getattr(config, 'num_hidden_layers', None)
        or getattr(config, 'n_layer', None)
        or getattr(config, 'num_layers', None)
        or 32
    )
    print(f" Model has {model_layers} layers (detected in {time.time() - start:.1f}s)")

    # Calculate expected combinations (estimate based on the config's layer count)
    total_combos = sum(
        comb(model_layers, r)
        for r in range(1, min(max_combo_size, model_layers) + 1)
    )
    if max_combo_size < model_layers:
        total_combos += 1  # all layers
    print(f" Will test {total_combos:,} combinations (1 to {max_combo_size} layers + all {model_layers})")
    print("=" * 80)

    # The temporary files are only needed until the activations are loaded.
    with tempfile.TemporaryDirectory() as tmpdir:
        pairs_file = os.path.join(tmpdir, "pairs.json")
        activations_file = os.path.join(tmpdir, "activations.json")

        # Step 1: Generate pairs
        print(f"\n[1] Generating {num_pairs} pairs for {task}...")
        start = time.time()
        proc = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "generate-pairs-from-task",
                task,
                "--output", pairs_file,
                "--limit", str(num_pairs),
            ],
            capture_output=True,
            text=True,
            timeout=600,
        )
        if proc.returncode != 0:
            print(f"ERROR: Pair generation failed: {proc.stderr}")
            return
        print(f" Generated pairs in {time.time() - start:.1f}s")

        # Step 2: Get activations for ALL layers (requested as 1..model_layers)
        layers_arg = ",".join(str(i) for i in range(1, model_layers + 1))

        print(f"\n[2] Extracting activations for layers 1-{model_layers}...")
        start = time.time()
        proc = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "get-activations",
                pairs_file,
                "--output", activations_file,
                "--model", model,
                "--layers", layers_arg,
                "--token-aggregation", "final",
            ],
            capture_output=True,
            text=True,
            timeout=1800,
        )
        if proc.returncode != 0:
            print(f"ERROR: Activation extraction failed: {proc.stderr}")
            return
        print(f" Extracted activations in {time.time() - start:.1f}s")

        # Step 3: Load activations
        print("\n[3] Loading activations from file...")
        with open(activations_file, 'r') as f:
            data = json.load(f)

    pairs_list = data.get('pairs', [])
    print(f" Loaded {len(pairs_list)} pairs with activations")

    # Step 4: Convert to tensors by layer
    print("\n[4] Converting to tensors by layer...")
    pos_by_layer: Dict[int, List[torch.Tensor]] = {}
    neg_by_layer: Dict[int, List[torch.Tensor]] = {}

    for pair in pairs_list:
        pos_la = pair.get('positive_response', {}).get('layers_activations', {})
        neg_la = pair.get('negative_response', {}).get('layers_activations', {})

        for layer_key, pos_values in pos_la.items():
            # Keep a sample only when both sides have this layer, so the
            # positive and negative lists stay aligned pair-by-pair.
            if layer_key not in neg_la:
                continue
            layer = int(layer_key)
            pos_by_layer.setdefault(layer, []).append(torch.tensor(pos_values).reshape(-1))
            neg_by_layer.setdefault(layer, []).append(torch.tensor(neg_la[layer_key]).reshape(-1))

    pos_tensors: Dict[int, torch.Tensor] = {}
    neg_tensors: Dict[int, torch.Tensor] = {}
    for layer in sorted(pos_by_layer):
        pos_tensors[layer] = torch.stack(pos_by_layer[layer])
        neg_tensors[layer] = torch.stack(neg_by_layer[layer])
        print(f" Layer {layer}: {pos_tensors[layer].shape}")

    num_layers = len(pos_tensors)
    print(f"\n {num_layers} layers available")
    if num_layers == 0:
        # Nothing to analyze; fail the same way the CLI steps do.
        print("ERROR: No paired layer activations found in extraction output")
        return

    # Step 5: Run limited analysis
    print(f"\n[5] Running limited analysis ({total_combos:,} combinations)...")
    analysis_start = time.time()

    last_count = 0
    last_time = analysis_start

    def progress_callback(current: int, total: int):
        # Report every 100 combinations, every 30 seconds, or on completion.
        nonlocal last_count, last_time
        now = time.time()
        if current - last_count >= 100 or now - last_time >= 30 or current == total:
            spent = now - analysis_start
            rate = current / spent if spent > 0 else 0
            remaining = (total - current) / rate if rate > 0 else 0
            pct = 100 * current / total if total else 100.0
            print(f" Progress: {current:,}/{total:,} ({pct:.1f}%) - {rate:.1f} combos/sec - ETA: {remaining:.0f}s")
            last_count = current
            last_time = now

    result = detect_geometry_limited(
        pos_tensors,
        neg_tensors,
        max_combo_size=max_combo_size,
        combination_method="concat",
        progress_callback=progress_callback,
    )

    elapsed = time.time() - analysis_start
    # Use the number of combinations actually tested, not the config-based
    # estimate, and avoid dividing by a zero-length interval.
    combos_per_sec = result.total_combinations / elapsed if elapsed > 0 else 0.0
    print(f"\n Completed in {elapsed:.1f}s ({combos_per_sec:.1f} combos/sec)")

    # Print results
    print("\n" + "=" * 80)
    print("RESULTS")
    print("=" * 80)

    print(f"\nTotal combinations tested: {result.total_combinations}")
    print(f"\nBest combination: {result.best_combination}")
    print(f"Best score: {result.best_score:.4f}")
    print(f"Best structure: {result.best_structure.value}")

    print(f"\nBest single layer: L{result.single_layer_best}")
    print(f"Best single layer score: {result.single_layer_best_score:.4f}")
    print(f"Combination beats single: {result.combination_beats_single}")
    print(f"Improvement over single: {result.improvement_over_single:.4f}")

    print("\n--- Top 10 Combinations ---")
    for rank, r in enumerate(result.top_10, start=1):
        label = "+".join(f"L{layer_id}" for layer_id in r.layers)
        print(f"{rank}. {label}: {r.best_score:.4f} ({r.best_structure.value})")

    print(f"\nRecommendation: {result.recommendation}")

    # Save results
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(output_dir, f"geometry_limited_{task}_{timestamp}.json")

    results_json = {
        "task": task,
        "model": model,
        "num_pairs": num_pairs,
        "max_combo_size": max_combo_size,
        "total_combinations": result.total_combinations,
        "best_combination": list(result.best_combination),
        "best_score": result.best_score,
        "best_structure": result.best_structure.value,
        "single_layer_best": result.single_layer_best,
        "single_layer_best_score": result.single_layer_best_score,
        "combination_beats_single": result.combination_beats_single,
        "improvement_over_single": result.improvement_over_single,
        "top_10": [
            {
                "layers": list(r.layers),
                "best_score": r.best_score,
                "best_structure": r.best_structure.value,
                "all_scores": r.all_scores,
            }
            for r in result.top_10
        ],
        "top_100": [
            {
                "layers": list(r.layers),
                "best_score": r.best_score,
                "best_structure": r.best_structure.value,
            }
            for r in result.all_results[:100]
        ],
        # json.dump would emit a bare NaN token, which is not valid JSON;
        # store null instead (same handling as the exhaustive analysis).
        "patterns": {
            k: None if isinstance(v, float) and isnan(v) else v
            for k, v in result.patterns.items()
        },
        "recommendation": result.recommendation,
    }

    with open(output_file, "w") as f:
        json.dump(results_json, f, indent=2)
    print(f"\nResults saved to: {output_file}")

    return result
549
+
550
+
551
def run_contiguous_layer_analysis(
    task: str = "truthfulqa_gen",
    model: str = "meta-llama/Llama-3.2-1B-Instruct",
    num_pairs: int = 50,
    output_dir: str = "/home/ubuntu/output",
):
    """
    Run contiguous layer combination analysis.

    Only tests combinations where layers are adjacent (1-2, 2-3, 1-5, etc.).
    Very fast: O(N^2) = N*(N+1)/2 combinations.

    For 36 layers: 666 combinations
    For 24 layers: 300 combinations

    Pipeline: read the layer count from the model config, call the wisent CLI
    in subprocesses to generate contrastive pairs and extract activations
    into a temporary directory, stack the activations into one tensor per
    layer, run ``detect_geometry_contiguous``, print a report and write a
    JSON summary into ``output_dir``.

    Args:
        task: Task name passed to the ``generate-pairs-from-task`` command.
        model: Model identifier used for ``AutoConfig.from_pretrained`` and
            passed to ``get-activations --model``.
        num_pairs: Passed to pair generation as ``--limit``.
        output_dir: Directory (created if missing) that receives the JSON
            summary file.

    Returns:
        The result object of ``detect_geometry_contiguous``, or ``None`` when
        a CLI step exits with a non-zero status or no layer has paired
        activations.
    """
    from wisent.core.contrastive_pairs.diagnostics.control_vectors import (
        detect_geometry_contiguous,
    )
    from math import isnan

    sys.stdout.reconfigure(line_buffering=True)

    print("=" * 80)
    print("CONTIGUOUS LAYER COMBINATION ANALYSIS")
    print("=" * 80)
    print(f"Task: {task}")
    print(f"Model: {model}")
    print(f"Num pairs: {num_pairs}")
    print(f"Output dir: {output_dir}")

    # Auto-detect model layer count from config
    print("\n[0] Detecting model layer count from config...")
    start = time.time()
    from transformers import AutoConfig
    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally; only use with model sources you trust.
    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
    # Config classes name the layer count differently; fall back to 32.
    model_layers = (
        getattr(config, 'num_hidden_layers', None)
        or getattr(config, 'n_layer', None)
        or getattr(config, 'num_layers', None)
        or 32
    )
    print(f" Model has {model_layers} layers (detected in {time.time() - start:.1f}s)")

    # Calculate expected combinations (estimate based on the config's layer count)
    total_combos = model_layers * (model_layers + 1) // 2
    print(f" Will test {total_combos:,} contiguous combinations")
    print("=" * 80)

    # The temporary files are only needed until the activations are loaded.
    with tempfile.TemporaryDirectory() as tmpdir:
        pairs_file = os.path.join(tmpdir, "pairs.json")
        activations_file = os.path.join(tmpdir, "activations.json")

        # Step 1: Generate pairs
        print(f"\n[1] Generating {num_pairs} pairs for {task}...")
        start = time.time()
        proc = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "generate-pairs-from-task",
                task,
                "--output", pairs_file,
                "--limit", str(num_pairs),
            ],
            capture_output=True,
            text=True,
            timeout=600,
        )
        if proc.returncode != 0:
            print(f"ERROR: Pair generation failed: {proc.stderr}")
            return
        print(f" Generated pairs in {time.time() - start:.1f}s")

        # Step 2: Get activations for ALL layers (requested as 1..model_layers)
        layers_arg = ",".join(str(i) for i in range(1, model_layers + 1))

        print(f"\n[2] Extracting activations for layers 1-{model_layers}...")
        start = time.time()
        proc = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "get-activations",
                pairs_file,
                "--output", activations_file,
                "--model", model,
                "--layers", layers_arg,
                "--token-aggregation", "final",
            ],
            capture_output=True,
            text=True,
            timeout=1800,
        )
        if proc.returncode != 0:
            print(f"ERROR: Activation extraction failed: {proc.stderr}")
            return
        print(f" Extracted activations in {time.time() - start:.1f}s")

        # Step 3: Load activations
        print("\n[3] Loading activations from file...")
        with open(activations_file, 'r') as f:
            data = json.load(f)

    pairs_list = data.get('pairs', [])
    print(f" Loaded {len(pairs_list)} pairs with activations")

    # Step 4: Convert to tensors by layer
    print("\n[4] Converting to tensors by layer...")
    pos_by_layer: Dict[int, List[torch.Tensor]] = {}
    neg_by_layer: Dict[int, List[torch.Tensor]] = {}

    for pair in pairs_list:
        pos_la = pair.get('positive_response', {}).get('layers_activations', {})
        neg_la = pair.get('negative_response', {}).get('layers_activations', {})

        for layer_key, pos_values in pos_la.items():
            # Keep a sample only when both sides have this layer, so the
            # positive and negative lists stay aligned pair-by-pair.
            if layer_key not in neg_la:
                continue
            layer = int(layer_key)
            pos_by_layer.setdefault(layer, []).append(torch.tensor(pos_values).reshape(-1))
            neg_by_layer.setdefault(layer, []).append(torch.tensor(neg_la[layer_key]).reshape(-1))

    pos_tensors: Dict[int, torch.Tensor] = {}
    neg_tensors: Dict[int, torch.Tensor] = {}
    for layer in sorted(pos_by_layer):
        pos_tensors[layer] = torch.stack(pos_by_layer[layer])
        neg_tensors[layer] = torch.stack(neg_by_layer[layer])
        print(f" Layer {layer}: {pos_tensors[layer].shape}")

    num_layers = len(pos_tensors)
    print(f"\n {num_layers} layers available")
    if num_layers == 0:
        # Nothing to analyze; fail the same way the CLI steps do.
        print("ERROR: No paired layer activations found in extraction output")
        return

    # Step 5: Run contiguous analysis
    print(f"\n[5] Running contiguous analysis ({total_combos:,} combinations)...")
    analysis_start = time.time()

    last_count = 0
    last_time = analysis_start

    def progress_callback(current: int, total: int):
        # Report every 50 combinations, every 30 seconds, or on completion.
        nonlocal last_count, last_time
        now = time.time()
        if current - last_count >= 50 or now - last_time >= 30 or current == total:
            spent = now - analysis_start
            rate = current / spent if spent > 0 else 0
            remaining = (total - current) / rate if rate > 0 else 0
            pct = 100 * current / total if total else 100.0
            print(f" Progress: {current:,}/{total:,} ({pct:.1f}%) - {rate:.1f} combos/sec - ETA: {remaining:.0f}s")
            last_count = current
            last_time = now

    result = detect_geometry_contiguous(
        pos_tensors,
        neg_tensors,
        combination_method="concat",
        progress_callback=progress_callback,
    )

    elapsed = time.time() - analysis_start
    # Use the number of combinations actually tested, not the config-based
    # estimate, and avoid dividing by a zero-length interval.
    combos_per_sec = result.total_combinations / elapsed if elapsed > 0 else 0.0
    print(f"\n Completed in {elapsed:.1f}s ({combos_per_sec:.1f} combos/sec)")

    # Print results
    print("\n" + "=" * 80)
    print("RESULTS")
    print("=" * 80)

    print(f"\nTotal combinations tested: {result.total_combinations}")
    print(f"\nBest combination: {result.best_combination}")
    print(f"Best score: {result.best_score:.4f}")
    print(f"Best structure: {result.best_structure.value}")

    print(f"\nBest single layer: L{result.single_layer_best}")
    print(f"Best single layer score: {result.single_layer_best_score:.4f}")
    print(f"Combination beats single: {result.combination_beats_single}")
    print(f"Improvement over single: {result.improvement_over_single:.4f}")

    print("\n--- Top 10 Combinations ---")
    for rank, r in enumerate(result.top_10, start=1):
        # Ranges are shown by their first and last layer only.
        if len(r.layers) > 1:
            label = f"L{r.layers[0]}-L{r.layers[-1]}"
        else:
            label = f"L{r.layers[0]}"
        print(f"{rank}. {label} ({len(r.layers)} layers): {r.best_score:.4f} ({r.best_structure.value})")

    print(f"\nRecommendation: {result.recommendation}")

    # Save results
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(output_dir, f"geometry_contiguous_{task}_{timestamp}.json")

    results_json = {
        "task": task,
        "model": model,
        "num_pairs": num_pairs,
        "mode": "contiguous",
        "total_combinations": result.total_combinations,
        "best_combination": list(result.best_combination),
        "best_score": result.best_score,
        "best_structure": result.best_structure.value,
        "single_layer_best": result.single_layer_best,
        "single_layer_best_score": result.single_layer_best_score,
        "combination_beats_single": result.combination_beats_single,
        "improvement_over_single": result.improvement_over_single,
        "top_10": [
            {
                "layers": list(r.layers),
                "best_score": r.best_score,
                "best_structure": r.best_structure.value,
                "all_scores": r.all_scores,
            }
            for r in result.top_10
        ],
        "top_100": [
            {
                "layers": list(r.layers),
                "best_score": r.best_score,
                "best_structure": r.best_structure.value,
            }
            for r in result.all_results[:100]
        ],
        # json.dump would emit a bare NaN token, which is not valid JSON;
        # store null instead (same handling as the exhaustive analysis).
        "patterns": {
            k: None if isinstance(v, float) and isnan(v) else v
            for k, v in result.patterns.items()
        },
        "recommendation": result.recommendation,
    }

    with open(output_file, "w") as f:
        json.dump(results_json, f, indent=2)
    print(f"\nResults saved to: {output_file}")

    return result
771
+
772
+
773
# Token-aggregation modes iterated by run_comprehensive_sweep; each value is
# forwarded to the CLI as --token-aggregation.
TOKEN_AGGREGATIONS = [
    "final",
    "average",
    "first",
    "max",
    "min",
    "max_score",
]

# Prompt strategies iterated by run_comprehensive_sweep; each value is
# forwarded to the CLI as --prompt-strategy.
PROMPT_STRATEGIES = [
    "chat_template",
    "direct_completion",
    "instruction_following",
    "multiple_choice",
    "role_playing",
]
775
+
776
+
777
def run_smart_layer_analysis(
    task: str = "truthfulqa_gen",
    model: str = "meta-llama/Llama-3.2-1B-Instruct",
    num_pairs: int = 50,
    max_combo_size: int = 3,
    token_aggregation: str = "final",
    prompt_strategy: str = "chat_template",
    output_dir: str = "/home/ubuntu/output",
):
    """
    Run smart layer combination analysis.

    Combines contiguous + limited search: tests all contiguous ranges (L1-L5, L3-L8, etc.)
    plus all 1,2,3-layer non-contiguous combinations. Deduplicates overlaps.

    Rough sizes with max_combo_size=3 (the exact, deduplicated count is
    reported by ``detect_geometry_smart`` in ``total_combinations``):
    - 36 layers: 666 contiguous ranges and 7,806 combinations of 1-3 layers
    - 24 layers: 300 contiguous ranges and 2,324 combinations of 1-3 layers

    Pipeline: read the layer count from the model config, call the wisent CLI
    in subprocesses to generate contrastive pairs and extract activations
    into a temporary directory, stack the activations into one tensor per
    layer, run ``detect_geometry_smart``, print a report and write a JSON
    summary into ``output_dir``.

    Args:
        task: Task name passed to the ``generate-pairs-from-task`` command.
        model: Model identifier used for ``AutoConfig.from_pretrained`` and
            passed to ``get-activations --model``.
        num_pairs: Passed to pair generation as ``--limit``.
        max_combo_size: Forwarded to ``detect_geometry_smart``.
        token_aggregation: Passed to ``get-activations --token-aggregation``.
        prompt_strategy: Passed to ``get-activations --prompt-strategy``.
        output_dir: Directory (created if missing) that receives the JSON
            summary file.

    Returns:
        The result object of ``detect_geometry_smart``, or ``None`` when a
        CLI step exits with a non-zero status or no layer has paired
        activations.
    """
    from wisent.core.contrastive_pairs.diagnostics.control_vectors import (
        detect_geometry_smart,
    )
    from math import comb, isnan

    sys.stdout.reconfigure(line_buffering=True)

    print("=" * 80)
    print("SMART LAYER COMBINATION ANALYSIS")
    print("(Contiguous + Limited 1,2,3-layer combinations)")
    print("=" * 80)
    print(f"Task: {task}")
    print(f"Model: {model}")
    print(f"Num pairs: {num_pairs}")
    print(f"Max combo size: {max_combo_size}")
    print(f"Token aggregation: {token_aggregation}")
    print(f"Prompt strategy: {prompt_strategy}")
    print(f"Output dir: {output_dir}")

    # Auto-detect model layer count from config
    print("\n[0] Detecting model layer count from config...")
    start = time.time()
    from transformers import AutoConfig
    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally; only use with model sources you trust.
    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
    # Config classes name the layer count differently; fall back to 32.
    model_layers = (
        getattr(config, 'num_hidden_layers', None)
        or getattr(config, 'n_layer', None)
        or getattr(config, 'num_layers', None)
        or 32
    )
    print(f" Model has {model_layers} layers (detected in {time.time() - start:.1f}s)")

    # Calculate expected combinations (estimate, actual will be less due to deduplication)
    contiguous = model_layers * (model_layers + 1) // 2
    limited = sum(
        comb(model_layers, r)
        for r in range(1, min(max_combo_size, model_layers) + 1)
    )
    print(f" Contiguous: {contiguous:,}, Limited 1-{max_combo_size}: {limited:,}")
    print(" (Actual will be less due to deduplication)")
    print("=" * 80)

    # The temporary files are only needed until the activations are loaded.
    with tempfile.TemporaryDirectory() as tmpdir:
        pairs_file = os.path.join(tmpdir, "pairs.json")
        activations_file = os.path.join(tmpdir, "activations.json")

        # Step 1: Generate pairs
        print(f"\n[1] Generating {num_pairs} pairs for {task}...")
        start = time.time()
        proc = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "generate-pairs-from-task",
                task,
                "--output", pairs_file,
                "--limit", str(num_pairs),
            ],
            capture_output=True,
            text=True,
            timeout=600,
        )
        if proc.returncode != 0:
            print(f"ERROR: Pair generation failed: {proc.stderr}")
            return
        print(f" Generated pairs in {time.time() - start:.1f}s")

        # Step 2: Get activations for ALL layers (requested as 1..model_layers)
        layers_arg = ",".join(str(i) for i in range(1, model_layers + 1))

        print(f"\n[2] Extracting activations for layers 1-{model_layers}...")
        print(f" Token aggregation: {token_aggregation}, Prompt strategy: {prompt_strategy}")
        start = time.time()
        proc = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "get-activations",
                pairs_file,
                "--output", activations_file,
                "--model", model,
                "--layers", layers_arg,
                "--token-aggregation", token_aggregation,
                "--prompt-strategy", prompt_strategy,
            ],
            capture_output=True,
            text=True,
            timeout=1800,
        )
        if proc.returncode != 0:
            print(f"ERROR: Activation extraction failed: {proc.stderr}")
            return
        print(f" Extracted activations in {time.time() - start:.1f}s")

        # Step 3: Load activations
        print("\n[3] Loading activations from file...")
        with open(activations_file, 'r') as f:
            data = json.load(f)

    pairs_list = data.get('pairs', [])
    print(f" Loaded {len(pairs_list)} pairs with activations")

    # Step 4: Convert to tensors by layer
    print("\n[4] Converting to tensors by layer...")
    pos_by_layer: Dict[int, List[torch.Tensor]] = {}
    neg_by_layer: Dict[int, List[torch.Tensor]] = {}

    for pair in pairs_list:
        pos_la = pair.get('positive_response', {}).get('layers_activations', {})
        neg_la = pair.get('negative_response', {}).get('layers_activations', {})

        for layer_key, pos_values in pos_la.items():
            # Keep a sample only when both sides have this layer, so the
            # positive and negative lists stay aligned pair-by-pair.
            if layer_key not in neg_la:
                continue
            layer = int(layer_key)
            pos_by_layer.setdefault(layer, []).append(torch.tensor(pos_values).reshape(-1))
            neg_by_layer.setdefault(layer, []).append(torch.tensor(neg_la[layer_key]).reshape(-1))

    pos_tensors: Dict[int, torch.Tensor] = {}
    neg_tensors: Dict[int, torch.Tensor] = {}
    for layer in sorted(pos_by_layer):
        pos_tensors[layer] = torch.stack(pos_by_layer[layer])
        neg_tensors[layer] = torch.stack(neg_by_layer[layer])
        print(f" Layer {layer}: {pos_tensors[layer].shape}")

    num_layers = len(pos_tensors)
    print(f"\n {num_layers} layers available")
    if num_layers == 0:
        # Nothing to analyze; fail the same way the CLI steps do.
        print("ERROR: No paired layer activations found in extraction output")
        return

    # Step 5: Run smart analysis
    print("\n[5] Running smart analysis...")
    analysis_start = time.time()

    last_count = 0
    last_time = analysis_start

    def progress_callback(current: int, total: int):
        # Report every 100 combinations, every 30 seconds, or on completion.
        nonlocal last_count, last_time
        now = time.time()
        if current - last_count >= 100 or now - last_time >= 30 or current == total:
            spent = now - analysis_start
            rate = current / spent if spent > 0 else 0
            remaining = (total - current) / rate if rate > 0 else 0
            pct = 100 * current / total if total else 100.0
            print(f" Progress: {current:,}/{total:,} ({pct:.1f}%) - {rate:.1f} combos/sec - ETA: {remaining:.0f}s")
            last_count = current
            last_time = now

    result = detect_geometry_smart(
        pos_tensors,
        neg_tensors,
        max_combo_size=max_combo_size,
        combination_method="concat",
        progress_callback=progress_callback,
    )

    elapsed = time.time() - analysis_start
    # Avoid dividing by a zero-length interval on very fast runs.
    combos_per_sec = result.total_combinations / elapsed if elapsed > 0 else 0.0
    print(f"\n Completed in {elapsed:.1f}s ({combos_per_sec:.1f} combos/sec)")

    # Print results
    print("\n" + "=" * 80)
    print("RESULTS")
    print("=" * 80)

    print(f"\nTotal combinations tested: {result.total_combinations}")
    print(f"\nBest combination: {result.best_combination}")
    print(f"Best score: {result.best_score:.4f}")
    print(f"Best structure: {result.best_structure.value}")

    print(f"\nBest single layer: L{result.single_layer_best}")
    print(f"Best single layer score: {result.single_layer_best_score:.4f}")
    print(f"Combination beats single: {result.combination_beats_single}")
    print(f"Improvement over single: {result.improvement_over_single:.4f}")

    print("\n--- Top 10 Combinations ---")
    for rank, r in enumerate(result.top_10, start=1):
        # A span equal to the element count minus one is treated as a
        # contiguous range; assumes r.layers is sorted and free of
        # duplicates -- TODO confirm against detect_geometry_smart.
        is_range = len(r.layers) > 1 and r.layers[-1] - r.layers[0] == len(r.layers) - 1
        if is_range:
            label = f"L{r.layers[0]}-L{r.layers[-1]}"
        else:
            label = "+".join(f"L{layer_id}" for layer_id in r.layers)
        print(f"{rank}. {label} ({len(r.layers)} layers): {r.best_score:.4f} ({r.best_structure.value})")

    print(f"\nRecommendation: {result.recommendation}")

    # Save results
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(output_dir, f"geometry_smart_{task}_{token_aggregation}_{prompt_strategy}_{timestamp}.json")

    results_json = {
        "task": task,
        "model": model,
        "num_pairs": num_pairs,
        "mode": "smart",
        "max_combo_size": max_combo_size,
        "token_aggregation": token_aggregation,
        "prompt_strategy": prompt_strategy,
        "total_combinations": result.total_combinations,
        "best_combination": list(result.best_combination),
        "best_score": result.best_score,
        "best_structure": result.best_structure.value,
        "single_layer_best": result.single_layer_best,
        "single_layer_best_score": result.single_layer_best_score,
        "combination_beats_single": result.combination_beats_single,
        "improvement_over_single": result.improvement_over_single,
        "top_10": [
            {
                "layers": list(r.layers),
                "best_score": r.best_score,
                "best_structure": r.best_structure.value,
                "all_scores": r.all_scores,
            }
            for r in result.top_10
        ],
        "top_100": [
            {
                "layers": list(r.layers),
                "best_score": r.best_score,
                "best_structure": r.best_structure.value,
            }
            for r in result.all_results[:100]
        ],
        # json.dump would emit a bare NaN token, which is not valid JSON;
        # store null instead (same handling as the exhaustive analysis).
        "patterns": {
            k: None if isinstance(v, float) and isnan(v) else v
            for k, v in result.patterns.items()
        },
        "recommendation": result.recommendation,
    }

    with open(output_file, "w") as f:
        json.dump(results_json, f, indent=2)
    print(f"\nResults saved to: {output_file}")

    return result
1017
+
1018
+
1019
def run_comprehensive_sweep(
    task: str = "truthfulqa_gen",
    model: str = "meta-llama/Llama-3.2-1B-Instruct",
    num_pairs: int = 50,
    max_combo_size: int = 3,
    output_dir: str = "/home/ubuntu/output",
):
    """
    Run comprehensive sweep across all token aggregations and prompt strategies.

    Calls ``run_smart_layer_analysis`` once for every
    (token aggregation, prompt strategy) pair drawn from ``TOKEN_AGGREGATIONS``
    and ``PROMPT_STRATEGIES``, prints a ranking of the configurations by
    ``best_score`` and writes a JSON summary into ``output_dir``.

    A failure in one configuration does not abort the sweep: the exception
    message is stored under ``"error"`` in that configuration's record and
    the loop moves on to the next pair.

    Args:
        task: Task name, forwarded to the per-configuration analysis and used
            in the summary file name.
        model: Model identifier, forwarded to the per-configuration analysis.
        num_pairs: Number of pairs, forwarded to the per-configuration analysis.
        max_combo_size: Maximum layer-combination size, forwarded to the
            per-configuration analysis.
        output_dir: Directory for the per-configuration outputs and the sweep
            summary. It is created if it does not exist.

    Returns:
        The summary dict that was written to disk. ``"all_results"`` holds one
        record per configuration (successful, failed or empty) and
        ``"top_10"`` holds the ten best successful records.
    """
    # stdout may have been replaced by a stream without ``reconfigure``
    # (notebooks, capture wrappers); line buffering is only a convenience,
    # so skip it instead of crashing before the sweep starts.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if callable(reconfigure):
        reconfigure(line_buffering=True)

    # Create the output directory up front so an unusable path is reported
    # immediately and the summary can still be written when every
    # configuration fails.
    os.makedirs(output_dir, exist_ok=True)

    total_configs = len(TOKEN_AGGREGATIONS) * len(PROMPT_STRATEGIES)

    print("=" * 80)
    print("COMPREHENSIVE GEOMETRY SWEEP")
    print("=" * 80)
    print(f"Task: {task}")
    print(f"Model: {model}")
    print(f"Num pairs: {num_pairs}")
    print(f"Token aggregations: {TOKEN_AGGREGATIONS}")
    print(f"Prompt strategies: {PROMPT_STRATEGIES}")
    print(f"Total configurations: {total_configs}")
    print("=" * 80)

    all_results = []
    config_idx = 0

    for token_agg in TOKEN_AGGREGATIONS:
        for prompt_strat in PROMPT_STRATEGIES:
            config_idx += 1
            print(f"\n{'='*80}")
            print(f"CONFIG {config_idx}/{total_configs}: {token_agg} + {prompt_strat}")
            print("=" * 80)

            try:
                result = run_smart_layer_analysis(
                    task=task,
                    model=model,
                    num_pairs=num_pairs,
                    max_combo_size=max_combo_size,
                    token_aggregation=token_agg,
                    prompt_strategy=prompt_strat,
                    output_dir=output_dir,
                )

                if result:
                    all_results.append({
                        "token_aggregation": token_agg,
                        "prompt_strategy": prompt_strat,
                        "best_combination": list(result.best_combination),
                        "best_score": result.best_score,
                        "best_structure": result.best_structure.value,
                        "single_layer_best": result.single_layer_best,
                        "single_layer_best_score": result.single_layer_best_score,
                        "improvement_over_single": result.improvement_over_single,
                    })
                else:
                    # Keep a trace of configurations that produced nothing so
                    # they do not silently vanish from the summary.
                    print(f"No result for config {token_agg}+{prompt_strat}")
                    all_results.append({
                        "token_aggregation": token_agg,
                        "prompt_strategy": prompt_strat,
                        "error": "analysis returned no result",
                    })
            except Exception as e:
                # Deliberately broad: one bad configuration must not abort
                # the remaining ones.
                print(f"ERROR in config {token_agg}+{prompt_strat}: {e}")
                all_results.append({
                    "token_aggregation": token_agg,
                    "prompt_strategy": prompt_strat,
                    "error": str(e),
                })

    # Save summary
    print("\n" + "=" * 80)
    print("SWEEP SUMMARY")
    print("=" * 80)

    # Only records carrying a score took the success path; rank them
    # best-first.
    successful = [r for r in all_results if "best_score" in r]
    successful.sort(key=lambda x: x["best_score"], reverse=True)

    print(f"\nCompleted {len(successful)}/{total_configs} configurations")
    print("\n--- Top 10 Configurations ---")
    for i, r in enumerate(successful[:10]):
        print(f"{i+1}. {r['token_aggregation']}+{r['prompt_strategy']}: {r['best_score']:.4f} ({r['best_structure']}) - layers {r['best_combination']}")

    # Save sweep summary
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    summary_file = os.path.join(output_dir, f"geometry_sweep_summary_{task}_{timestamp}.json")

    summary = {
        "task": task,
        "model": model,
        "num_pairs": num_pairs,
        "max_combo_size": max_combo_size,
        "token_aggregations": TOKEN_AGGREGATIONS,
        "prompt_strategies": PROMPT_STRATEGIES,
        "total_configurations": total_configs,
        "successful_configurations": len(successful),
        "all_results": all_results,
        "top_10": successful[:10],
    }

    with open(summary_file, "w") as f:
        json.dump(summary, f, indent=2)
    print(f"\nSweep summary saved to: {summary_file}")

    return summary
1122
+
1123
+
1124
if __name__ == "__main__":
    # Command-line entry point: parse the options, then run exactly one
    # analysis mode. Modes are checked in priority order
    # sweep > exhaustive > contiguous > limited, with smart search as the
    # fallback when none of those flags is given.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--task", default="truthfulqa_gen")
    cli.add_argument("--model", default="meta-llama/Llama-3.2-1B-Instruct")
    cli.add_argument("--num-pairs", type=int, default=50)
    # WARNING: --max-layers must NOT be used in production runs.
    # Exhaustive analysis exists to test ALL layers; if memory is the
    # problem, move to a larger instance type rather than capping layers.
    cli.add_argument(
        "--max-layers",
        type=int,
        default=None,
        help="DEBUG ONLY - DO NOT USE IN PRODUCTION. Use larger instance instead.",
    )
    cli.add_argument("--output-dir", default="/home/ubuntu/output")
    cli.add_argument(
        "--sweep",
        action="store_true",
        help="Run comprehensive sweep across all token aggregations and prompt strategies",
    )
    # NOTE: with store_true and default=True this flag is always True, and
    # it is not consulted below; smart search is simply the fallback branch.
    cli.add_argument(
        "--smart",
        action="store_true",
        default=True,
        help="Use smart search (contiguous + 1,2,3-layer) - DEFAULT",
    )
    cli.add_argument(
        "--limited",
        action="store_true",
        help="Use limited search (1,2,3-layer combos + all layers)",
    )
    cli.add_argument(
        "--contiguous",
        action="store_true",
        help="Use contiguous search (adjacent layers only)",
    )
    cli.add_argument(
        "--exhaustive",
        action="store_true",
        help="Use exhaustive search (all 2^N combinations) - VERY SLOW",
    )
    cli.add_argument(
        "--max-combo-size",
        type=int,
        default=3,
        help="Max combination size for limited/smart search (default: 3)",
    )
    cli.add_argument(
        "--token-aggregation",
        default="final",
        choices=TOKEN_AGGREGATIONS,
        help="Token aggregation method (default: final)",
    )
    cli.add_argument(
        "--prompt-strategy",
        default="chat_template",
        choices=PROMPT_STRATEGIES,
        help="Prompt construction strategy (default: chat_template)",
    )
    opts = cli.parse_args()

    # Make a layer cap impossible to miss in the run log.
    if opts.max_layers is not None:
        print("!" * 80)
        print("WARNING: --max-layers is set! This should ONLY be used for debugging.")
        print("For production runs, use a larger instance type instead of capping layers.")
        print("!" * 80)

    # Keyword arguments accepted by every analysis entry point.
    shared = {
        "task": opts.task,
        "model": opts.model,
        "num_pairs": opts.num_pairs,
        "output_dir": opts.output_dir,
    }

    if opts.sweep:
        run_comprehensive_sweep(max_combo_size=opts.max_combo_size, **shared)
    elif opts.exhaustive:
        # The only mode that receives the --max-layers cap.
        run_exhaustive_layer_analysis(max_layers=opts.max_layers, **shared)
    elif opts.contiguous:
        run_contiguous_layer_analysis(**shared)
    elif opts.limited:
        run_limited_layer_analysis(max_combo_size=opts.max_combo_size, **shared)
    else:
        # Default: smart search, the only single-run mode that receives the
        # token-aggregation and prompt-strategy options.
        run_smart_layer_analysis(
            max_combo_size=opts.max_combo_size,
            token_aggregation=opts.token_aggregation,
            prompt_strategy=opts.prompt_strategy,
            **shared,
        )