wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1020) hide show
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +22 -6
  3. wisent/core/activations/activation_cache.py +393 -0
  4. wisent/core/activations/activations.py +22 -40
  5. wisent/core/activations/activations_collector.py +145 -373
  6. wisent/core/activations/classifier_inference_strategy.py +195 -0
  7. wisent/core/activations/core/atoms.py +8 -92
  8. wisent/core/activations/extraction_strategy.py +480 -0
  9. wisent/core/agent/diagnose/response_diagnostics.py +3 -3
  10. wisent/core/agent/diagnose.py +3 -3
  11. wisent/core/autonomous_agent.py +2 -2
  12. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  13. wisent/core/cli/__init__.py +2 -1
  14. wisent/core/cli/agent/apply_steering.py +25 -31
  15. wisent/core/cli/agent/evaluate_response.py +18 -20
  16. wisent/core/cli/agent/train_classifier.py +36 -26
  17. wisent/core/cli/check_linearity.py +35 -3
  18. wisent/core/cli/cluster_benchmarks.py +470 -0
  19. wisent/core/cli/create_steering_vector.py +19 -9
  20. wisent/core/cli/diagnose_vectors.py +7 -4
  21. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  22. wisent/core/cli/generate_pairs_from_task.py +9 -56
  23. wisent/core/cli/generate_vector_from_task.py +4 -0
  24. wisent/core/cli/geometry_search.py +137 -0
  25. wisent/core/cli/get_activations.py +13 -37
  26. wisent/core/cli/method_optimizer.py +860 -0
  27. wisent/core/cli/modify_weights.py +3 -2
  28. wisent/core/cli/optimize.py +44 -5
  29. wisent/core/cli/optimize_classification.py +5 -6
  30. wisent/core/cli/optimize_sample_size.py +9 -23
  31. wisent/core/cli/optimize_steering.py +433 -159
  32. wisent/core/cli/optimize_weights.py +67 -7
  33. wisent/core/cli/preview_pairs.py +203 -0
  34. wisent/core/cli/steering_method_trainer.py +8 -7
  35. wisent/core/cli/steering_search_space.py +20 -15
  36. wisent/core/cli/tasks.py +31 -117
  37. wisent/core/cli/train_unified_goodness.py +18 -19
  38. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1582 -177
  39. wisent/core/contrastive_pairs/diagnostics/linearity.py +70 -80
  40. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  53. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  54. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
  55. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  56. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  57. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  58. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  59. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  60. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  61. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  62. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  63. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  64. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  65. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  66. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  67. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  68. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  69. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
  70. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +11 -5
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  273. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  274. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  275. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  276. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  277. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  278. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  279. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  280. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  281. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  282. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  283. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  284. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  285. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  286. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  287. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  288. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  289. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  290. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  291. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  292. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  293. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  294. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  295. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  296. wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
  297. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
  298. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
  299. wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
  300. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
  301. wisent/core/evaluators/personalization/coherence.py +46 -0
  302. wisent/core/geometry_runner.py +995 -0
  303. wisent/core/geometry_search_space.py +237 -0
  304. wisent/core/hyperparameter_optimizer.py +14 -14
  305. wisent/core/lm_eval_harness_ground_truth.py +7 -11
  306. wisent/core/main.py +6 -0
  307. wisent/core/models/core/atoms.py +5 -3
  308. wisent/core/models/wisent_model.py +9 -8
  309. wisent/core/opti/methods/opti_weights.py +29 -2
  310. wisent/core/optuna/classifier/activation_generator.py +14 -12
  311. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  312. wisent/core/optuna/steering/steering_optimization.py +14 -9
  313. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  314. wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
  315. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  316. wisent/core/parser_arguments/generate_vector_from_task_parser.py +22 -2
  317. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  318. wisent/core/parser_arguments/main_parser.py +16 -0
  319. wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
  320. wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
  321. wisent/core/parser_arguments/tasks_parser.py +7 -19
  322. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  323. wisent/core/steering.py +5 -3
  324. wisent/core/steering_methods/core/atoms.py +1 -2
  325. wisent/core/steering_methods/methods/caa.py +1 -1
  326. wisent/core/steering_methods/methods/hyperplane.py +75 -0
  327. wisent/core/steering_methods/methods/prism.py +1 -2
  328. wisent/core/steering_methods/methods/pulse.py +39 -8
  329. wisent/core/steering_methods/methods/titan.py +59 -14
  330. wisent/core/steering_methods/registry.py +52 -12
  331. wisent/core/steering_optimizer.py +15 -15
  332. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  333. wisent/core/trainers/steering_trainer.py +11 -20
  334. wisent/core/utils/device.py +27 -27
  335. wisent/core/utils/layer_combinations.py +70 -0
  336. wisent/examples/__init__.py +1 -0
  337. wisent/examples/scripts/__init__.py +1 -0
  338. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  339. wisent/examples/scripts/discover_directions.py +469 -0
  340. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  341. wisent/examples/scripts/generate_paper_data.py +384 -0
  342. wisent/examples/scripts/intervention_validation.py +626 -0
  343. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  344. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  345. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  346. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  347. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  348. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  349. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  350. wisent/examples/scripts/search_all_short_names.py +31 -0
  351. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  352. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  353. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  354. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  355. wisent/examples/scripts/test_one_benchmark.py +324 -0
  356. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  357. wisent/examples/scripts/threshold_analysis.py +434 -0
  358. wisent/examples/scripts/visualization_gallery.py +582 -0
  359. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  360. wisent/parameters/lm_eval/category_directions.json +137 -0
  361. wisent/parameters/lm_eval/repair_plan.json +282 -0
  362. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
  363. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  364. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  365. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  366. wisent/scripts/run_quality_metrics_sweep.sh +22 -27
  367. wisent/tests/test_aggregation_geometry.py +236 -0
  368. wisent/tests/test_detector_accuracy.py +163 -0
  369. wisent/tests/test_geometry_exhaustive.py +1202 -0
  370. wisent/tests/visualize_geometry.py +255 -61
  371. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  372. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/RECORD +376 -974
  373. wisent/core/activations/prompt_construction_strategy.py +0 -47
  374. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  375. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
  376. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
  377. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
  378. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
  379. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
  380. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
  381. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
  382. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
  383. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
  384. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
  385. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
  386. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
  387. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
  388. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
  389. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
  390. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
  391. wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
  392. wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
  393. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
  394. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
  395. wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
  396. wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
  397. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
  398. wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
  399. wisent/examples/scripts/results/benchmark_tags.json +0 -917
  400. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
  401. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
  402. wisent/examples/scripts/results/failing_benchmarks.json +0 -946
  403. wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
  404. wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
  405. wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
  406. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
  407. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
  408. wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
  409. wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
  410. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
  411. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
  412. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
  413. wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
  414. wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
  415. wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
  416. wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
  417. wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
  418. wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
  419. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
  420. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
  421. wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
  422. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
  423. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
  424. wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
  425. wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
  426. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
  427. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
  428. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
  429. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
  430. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  431. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
  432. wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
  433. wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
  434. wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
  435. wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
  436. wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
  437. wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
  438. wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
  439. wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
  440. wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
  441. wisent/examples/scripts/results/test_aime_pairs.json +0 -8
  442. wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
  443. wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
  444. wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
  445. wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
  446. wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
  447. wisent/examples/scripts/results/test_anli_pairs.json +0 -8
  448. wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
  449. wisent/examples/scripts/results/test_apps_pairs.json +0 -8
  450. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
  451. wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
  452. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
  453. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
  454. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
  455. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
  456. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
  457. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
  458. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
  459. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
  460. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
  461. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
  462. wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
  463. wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
  464. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
  465. wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
  466. wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
  467. wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
  468. wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
  469. wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
  470. wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
  471. wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
  472. wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
  473. wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
  474. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
  475. wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
  476. wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
  477. wisent/examples/scripts/results/test_atis_pairs.json +0 -8
  478. wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
  479. wisent/examples/scripts/results/test_babi_pairs.json +0 -8
  480. wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
  481. wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
  482. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
  483. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
  484. wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
  485. wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
  486. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
  487. wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
  488. wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
  489. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
  490. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
  491. wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
  492. wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
  493. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
  494. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
  495. wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
  496. wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
  497. wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
  498. wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
  499. wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
  500. wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
  501. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
  502. wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
  503. wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
  504. wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
  505. wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
  506. wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
  507. wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
  508. wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
  509. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
  510. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
  511. wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
  512. wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
  513. wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
  514. wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
  515. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
  516. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
  517. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
  518. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
  519. wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
  520. wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
  521. wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
  522. wisent/examples/scripts/results/test_c4_pairs.json +0 -8
  523. wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
  524. wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
  525. wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
  526. wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
  527. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
  528. wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
  529. wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
  530. wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
  531. wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
  532. wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
  533. wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
  534. wisent/examples/scripts/results/test_cb_pairs.json +0 -8
  535. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
  536. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
  537. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
  538. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
  539. wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
  540. wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
  541. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
  542. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
  543. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
  544. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
  545. wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
  546. wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
  547. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
  548. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
  549. wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
  550. wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
  551. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
  552. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
  553. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
  554. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
  555. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
  556. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
  557. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
  558. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
  559. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
  560. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
  561. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
  562. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
  563. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
  564. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
  565. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
  566. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
  567. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
  568. wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
  569. wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
  570. wisent/examples/scripts/results/test_cola_pairs.json +0 -8
  571. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
  572. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
  573. wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
  574. wisent/examples/scripts/results/test_conala_pairs.json +0 -8
  575. wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
  576. wisent/examples/scripts/results/test_concode_pairs.json +0 -8
  577. wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
  578. wisent/examples/scripts/results/test_copa_pairs.json +0 -8
  579. wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
  580. wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
  581. wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
  582. wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
  583. wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
  584. wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
  585. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
  586. wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
  587. wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
  588. wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
  589. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
  590. wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
  591. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
  592. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
  593. wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
  594. wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
  595. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
  596. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
  597. wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
  598. wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
  599. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
  600. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
  601. wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
  602. wisent/examples/scripts/results/test_drop_pairs.json +0 -8
  603. wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
  604. wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
  605. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
  606. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
  607. wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
  608. wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
  609. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
  610. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
  611. wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
  612. wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
  613. wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
  614. wisent/examples/scripts/results/test_escola_pairs.json +0 -8
  615. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
  616. wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
  617. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
  618. wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
  619. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
  620. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
  621. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
  622. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
  623. wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
  624. wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
  625. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
  626. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
  627. wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
  628. wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
  629. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
  630. wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
  631. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
  632. wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
  633. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  634. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  635. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
  636. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
  637. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
  638. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
  639. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
  640. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
  641. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
  642. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
  643. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  644. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  645. wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
  646. wisent/examples/scripts/results/test_fda_pairs.json +0 -8
  647. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
  648. wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
  649. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
  650. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
  651. wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
  652. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
  653. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
  654. wisent/examples/scripts/results/test_fld_pairs.json +0 -8
  655. wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
  656. wisent/examples/scripts/results/test_flores_pairs.json +0 -14
  657. wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
  658. wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
  659. wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
  660. wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
  661. wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
  662. wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
  663. wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
  664. wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
  665. wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
  666. wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
  667. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
  668. wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
  669. wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
  670. wisent/examples/scripts/results/test_glue_pairs.json +0 -14
  671. wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
  672. wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
  673. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
  674. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
  675. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
  676. wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
  677. wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
  678. wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
  679. wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
  680. wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
  681. wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
  682. wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
  683. wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
  684. wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
  685. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
  686. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
  687. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
  688. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
  689. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
  690. wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
  691. wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
  692. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
  693. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
  694. wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
  695. wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
  696. wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
  697. wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
  698. wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
  699. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
  700. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
  701. wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
  702. wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
  703. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
  704. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
  705. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
  706. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
  707. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
  708. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
  709. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
  710. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
  711. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
  712. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
  713. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
  714. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
  715. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
  716. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
  717. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
  718. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
  719. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
  720. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
  721. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
  722. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
  723. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
  724. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
  725. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
  726. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
  727. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
  728. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
  729. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
  730. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
  731. wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
  732. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
  733. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
  734. wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
  735. wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
  736. wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
  737. wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
  738. wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
  739. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
  740. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
  741. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
  742. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
  743. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
  744. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
  745. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
  746. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
  747. wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
  748. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  749. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  750. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
  751. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
  752. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
  753. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
  754. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
  755. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
  756. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
  757. wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
  758. wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
  759. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  760. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  761. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  762. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  763. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
  764. wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
  765. wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
  766. wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
  767. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
  768. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
  769. wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
  770. wisent/examples/scripts/results/test_libra_pairs.json +0 -14
  771. wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
  772. wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
  773. wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
  774. wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
  775. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
  776. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
  777. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
  778. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
  779. wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
  780. wisent/examples/scripts/results/test_llama_pairs.json +0 -8
  781. wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
  782. wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
  783. wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
  784. wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
  785. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
  786. wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
  787. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
  788. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
  789. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
  790. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
  791. wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
  792. wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
  793. wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
  794. wisent/examples/scripts/results/test_math500_pairs.json +0 -8
  795. wisent/examples/scripts/results/test_math_evaluation.json +0 -30
  796. wisent/examples/scripts/results/test_math_pairs.json +0 -8
  797. wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
  798. wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
  799. wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
  800. wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
  801. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
  802. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
  803. wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
  804. wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
  805. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
  806. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
  807. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
  808. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
  809. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
  810. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
  811. wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
  812. wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
  813. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
  814. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
  815. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
  816. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
  817. wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
  818. wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
  819. wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
  820. wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
  821. wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
  822. wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
  823. wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
  824. wisent/examples/scripts/results/test_mela_pairs.json +0 -14
  825. wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
  826. wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
  827. wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
  828. wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
  829. wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
  830. wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
  831. wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
  832. wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
  833. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
  834. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
  835. wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
  836. wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
  837. wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
  838. wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
  839. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
  840. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
  841. wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
  842. wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
  843. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
  844. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
  845. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
  846. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
  847. wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
  848. wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
  849. wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
  850. wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
  851. wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
  852. wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
  853. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
  854. wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
  855. wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
  856. wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
  857. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
  858. wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
  859. wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
  860. wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
  861. wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
  862. wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
  863. wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
  864. wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
  865. wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
  866. wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
  867. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
  868. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
  869. wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
  870. wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
  871. wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
  872. wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
  873. wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
  874. wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
  875. wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
  876. wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
  877. wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
  878. wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
  879. wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
  880. wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
  881. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
  882. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
  883. wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
  884. wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
  885. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
  886. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
  887. wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
  888. wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
  889. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
  890. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
  891. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
  892. wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
  893. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
  894. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
  895. wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
  896. wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
  897. wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
  898. wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
  899. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
  900. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
  901. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
  902. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
  903. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
  904. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
  905. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
  906. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
  907. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
  908. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
  909. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  910. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  911. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  912. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  913. wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
  914. wisent/examples/scripts/results/test_prost_pairs.json +0 -8
  915. wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
  916. wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
  917. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
  918. wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
  919. wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
  920. wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
  921. wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
  922. wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
  923. wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
  924. wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
  925. wisent/examples/scripts/results/test_race_evaluation.json +0 -30
  926. wisent/examples/scripts/results/test_race_pairs.json +0 -8
  927. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
  928. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
  929. wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
  930. wisent/examples/scripts/results/test_recode_pairs.json +0 -8
  931. wisent/examples/scripts/results/test_record_evaluation.json +0 -30
  932. wisent/examples/scripts/results/test_record_pairs.json +0 -8
  933. wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
  934. wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
  935. wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
  936. wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
  937. wisent/examples/scripts/results/test_score_evaluation.json +0 -51
  938. wisent/examples/scripts/results/test_score_pairs.json +0 -14
  939. wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
  940. wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
  941. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
  942. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
  943. wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
  944. wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
  945. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
  946. wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
  947. wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
  948. wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
  949. wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
  950. wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
  951. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
  952. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
  953. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
  954. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
  955. wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
  956. wisent/examples/scripts/results/test_swag_pairs.json +0 -8
  957. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
  958. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
  959. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
  960. wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
  961. wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
  962. wisent/examples/scripts/results/test_translation_pairs.json +0 -14
  963. wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
  964. wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
  965. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
  966. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
  967. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
  968. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
  969. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
  970. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
  971. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
  972. wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
  973. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
  974. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
  975. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
  976. wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
  977. wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
  978. wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
  979. wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
  980. wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
  981. wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
  982. wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
  983. wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
  984. wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
  985. wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
  986. wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
  987. wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
  988. wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
  989. wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
  990. wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
  991. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
  992. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
  993. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
  994. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
  995. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
  996. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
  997. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
  998. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
  999. wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
  1000. wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
  1001. wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
  1002. wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
  1003. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
  1004. wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
  1005. wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
  1006. wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
  1007. wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
  1008. wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
  1009. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
  1010. wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
  1011. wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
  1012. wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
  1013. wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
  1014. wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
  1015. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
  1016. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
  1017. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  1018. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  1019. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  1020. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1202 @@
1
+ """
2
+ Exhaustive layer combination analysis.
3
+
4
+ Tests all 2^N - 1 layer combinations to find optimal layer subsets
5
+ for geometry detection.
6
+
7
+ Uses CLI commands for pair generation and activation extraction.
8
+
9
+ ===============================================================================
10
+ DEBUGGING NOTES - READ BEFORE MAKING ASSUMPTIONS
11
+ ===============================================================================
12
+
13
+ On Dec 15, 2025, a Qwen3-8B run (36 layers = 68 billion combinations) became
14
+ unresponsive after starting step [5]. The instance lost SSM connection, SSH
15
+ timed out, and required a reboot.
16
+
17
+ WHAT WE KNOW (facts with evidence):
18
+ - Step [5] started: "Running exhaustive analysis (68719476735 combinations)..."
19
+ - No further output after that line
20
+ - Instance became unreachable (SSM ConnectionLost, SSH timeout)
21
+ - After reboot, dmesg.0 showed NO OOM messages
22
+ - kern.log had no errors between 18:30 (step 5 start) and 19:58 (reboot)
23
+
24
+ WHAT WE DO NOT KNOW (no evidence):
25
+ - Whether the process was running or stuck
26
+ - Whether memory was exhausted (no OOM in logs)
27
+ - Whether CPU was pegged
28
+ - The actual cause of unresponsiveness
29
+
30
+ DO NOT ASSUME:
31
+ - That 68 billion combinations is "too many" without measuring
32
+ - That the list allocation caused OOM (no evidence)
33
+ - That the loop is slow (no benchmarks)
34
+ - ANY root cause without actual evidence from logs/metrics
35
+
36
+ If investigating future failures:
37
+ 1. Check dmesg BEFORE rebooting for OOM messages
38
+ 2. Check /var/log/kern.log for errors
39
+ 3. Try to SSH and run 'top', 'free -h', 'ps aux' before assuming crash
40
+ 4. Get actual memory/CPU metrics, don't guess
41
+
42
+ The instance may have been working fine but just not producing output.
43
+ ===============================================================================
44
+ """
45
+
46
+ import json
47
+ import os
48
+ import subprocess
49
+ import sys
50
+ import tempfile
51
+ import time
52
+ import torch
53
+ from datetime import datetime
54
+ from typing import Dict, List
55
+
56
+
57
def run_exhaustive_layer_analysis(
    task: str = "truthfulqa_gen",
    model: str = "meta-llama/Llama-3.2-1B-Instruct",
    num_pairs: int = 50,
    max_layers: int | None = None,
    output_dir: str = "/home/ubuntu/output",
):
    """
    Run exhaustive layer combination analysis.

    Tests all 2^N - 1 layer combinations to find which layer subsets
    produce the strongest geometric structure detection.

    Uses CLI commands:
    - generate-pairs-from-task: Generate contrastive pairs
    - get-activations: Extract activations for all layers

    Automatically detects the model's layer count.

    Args:
        task: Task name passed to ``generate-pairs-from-task``.
        model: Model identifier passed to ``AutoConfig.from_pretrained`` and
            to ``get-activations``.
        num_pairs: Value passed as ``--limit`` when generating pairs.
        max_layers: Optional cap on the number of layers (debugging only,
            see the warning below).
        output_dir: Directory the JSON summary is written to (created if
            missing).

    Returns:
        The object returned by ``detect_geometry_exhaustive``, or ``None``
        when a CLI step fails or times out, or when no layer has both
        positive and negative activations.

    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    WARNING: DO NOT SET max_layers TO REDUCE THE NUMBER OF LAYERS TESTED.

    The whole point of this analysis is to test ALL layer combinations.
    If you need to reduce combinations for feasibility:
    1. Use a larger instance (g6e.2xlarge = 64GB, g6e.4xlarge = 128GB, g6e.12xlarge = 384GB)
    2. Wait longer - it's supposed to take hours/days
    3. DO NOT artificially cap layers - that defeats the purpose

    max_layers exists ONLY for debugging/testing purposes, NOT for production runs.
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    """
    import math

    from wisent.core.contrastive_pairs.diagnostics.control_vectors import (
        detect_geometry_exhaustive,
    )

    # stdout may have been replaced by an object without reconfigure();
    # line buffering is a convenience, so skip it rather than crash.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if callable(reconfigure):
        reconfigure(line_buffering=True)

    print("=" * 80)
    print("EXHAUSTIVE LAYER COMBINATION ANALYSIS")
    print("=" * 80)
    print(f"Task: {task}")
    print(f"Model: {model}")
    print(f"Num pairs: {num_pairs}")
    print(f"Output dir: {output_dir}")

    # Auto-detect model layer count from config (without loading weights)
    print("\n[0] Detecting model layer count from config...")
    step_start = time.time()
    from transformers import AutoConfig
    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
    # Different models use different config keys for layer count; 32 is the
    # last-resort fallback when none of the known keys is set.
    model_layers = (
        getattr(config, "num_hidden_layers", None)
        or getattr(config, "n_layer", None)
        or getattr(config, "num_layers", None)
        or 32
    )
    print(f" Model has {model_layers} layers (detected in {time.time() - step_start:.1f}s)")

    # Determine layers to use
    if max_layers is not None:
        num_layers = min(max_layers, model_layers)
        print(f" Using {num_layers} layers (limited by --max-layers)")
    else:
        num_layers = model_layers

    print(f" Total combinations to test: {2**num_layers - 1:,}")
    print("=" * 80)

    with tempfile.TemporaryDirectory() as tmpdir:
        pairs_file = os.path.join(tmpdir, "pairs.json")
        activations_file = os.path.join(tmpdir, "activations.json")

        # Step 1: Generate pairs using CLI
        print(f"\n[1] Generating {num_pairs} pairs for {task}...")
        step_start = time.time()
        if not _exhaustive_run_cli_step(
            [
                "generate-pairs-from-task",
                task,
                "--output", pairs_file,
                "--limit", str(num_pairs),
            ],
            timeout=600,
            failure_label="Pair generation",
        ):
            return None
        print(f" Generated pairs in {time.time() - step_start:.1f}s")

        # Step 2: Get activations for ALL layers using CLI
        # Build layers string: "1,2,3,...,num_layers"
        requested_layers = ",".join(str(i) for i in range(1, num_layers + 1))

        print(f"\n[2] Extracting activations for layers 1-{num_layers}...")
        step_start = time.time()
        if not _exhaustive_run_cli_step(
            [
                "get-activations",
                pairs_file,
                "--output", activations_file,
                "--model", model,
                "--layers", requested_layers,
                "--token-aggregation", "final",
            ],
            timeout=1800,  # 30 min timeout for activation extraction
            failure_label="Activation extraction",
        ):
            return None
        print(f" Extracted activations in {time.time() - step_start:.1f}s")

        # Step 3: Load activations from JSON
        print("\n[3] Loading activations from file...")
        with open(activations_file, 'r') as f:
            data = json.load(f)

        pairs_list = data.get('pairs', [])
        print(f" Loaded {len(pairs_list)} pairs with activations")

        # Step 4: Convert to tensors by layer
        print("\n[4] Converting to tensors by layer...")
        pos_tensors, neg_tensors = _exhaustive_stack_layer_activations(
            pairs_list, max_layers
        )
        for layer, stacked in pos_tensors.items():
            print(f" Layer {layer}: {stacked.shape}")

    num_layers = len(pos_tensors)
    if num_layers == 0:
        # Nothing to combine: bail out through the same "print and return"
        # path as the CLI failures instead of starting an empty analysis.
        print("ERROR: No layers with both positive and negative activations were found")
        return None

    actual_combos = 2 ** num_layers - 1
    print(f"\n {num_layers} layers available -> {actual_combos} combinations to test")

    # Step 5: Run exhaustive analysis
    print(f"\n[5] Running exhaustive analysis ({actual_combos} combinations)...")
    analysis_start = time.time()

    last_report = [0, time.time()]  # [last_count, last_time]

    def progress_callback(current: int, total: int):
        # Report every 10000 combinations OR every 30 seconds, whichever comes first
        now = time.time()
        if current - last_report[0] >= 10000 or now - last_report[1] >= 30:
            elapsed_so_far = now - analysis_start
            rate = current / elapsed_so_far if elapsed_so_far > 0 else 0
            remaining = (total - current) / rate if rate > 0 else float('inf')
            pct = 100 * current / total if total else 0.0
            print(f" Progress: {current:,}/{total:,} ({pct:.4f}%) - {rate:.1f} combos/sec - ETA: {remaining:.0f}s")
            last_report[0] = current
            last_report[1] = now

    result = detect_geometry_exhaustive(
        pos_tensors,
        neg_tensors,
        max_layers=num_layers,
        combination_method="concat",
        progress_callback=progress_callback,
    )

    elapsed = time.time() - analysis_start
    # Guard against a zero reading on coarse clocks.
    combos_per_sec = actual_combos / elapsed if elapsed > 0 else float('inf')
    print(f"\n Completed in {elapsed:.1f}s ({combos_per_sec:.1f} combos/sec)")

    # Print results
    print("\n" + "=" * 80)
    print("RESULTS")
    print("=" * 80)

    print(f"\nTotal combinations tested: {result.total_combinations}")
    print(f"\nBest combination: {result.best_combination}")
    print(f"Best score: {result.best_score:.4f}")
    print(f"Best structure: {result.best_structure.value}")

    print(f"\nBest single layer: L{result.single_layer_best}")
    print(f"Best single layer score: {result.single_layer_best_score:.4f}")
    print(f"Combination beats single: {result.combination_beats_single}")
    print(f"Improvement over single: {result.improvement_over_single:.4f}")

    print("\n--- Top 10 Combinations ---")
    for rank, entry in enumerate(result.top_10, start=1):
        combo_label = "+".join(f"L{layer_id}" for layer_id in entry.layers)
        print(f" {rank}. {combo_label}: {entry.best_structure.value} = {entry.best_score:.4f}")

    print("\n--- Patterns ---")
    print(f" Most important layers: {result.patterns.get('most_important_layers', [])}")
    print(f" Optimal combination size: {result.patterns.get('optimal_combination_size', 1)}")
    print(f" Dominant structure: {result.patterns.get('dominant_structure', 'unknown')}")
    print(f" Best score by size: {result.patterns.get('best_score_by_size', {})}")
    print(f" Early vs late ratio: {result.patterns.get('early_vs_late_ratio', 0):.2f}")

    print("\n--- Recommendation ---")
    print(f" {result.recommendation}")

    # Save results
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = os.path.join(output_dir, f"exhaustive_geometry_{task}_{timestamp}.json")

    # Convert to serializable format
    results_json = {
        "task": task,
        "model": model,
        "num_pairs": num_pairs,
        "max_layers": num_layers,
        "total_combinations": result.total_combinations,
        "elapsed_seconds": elapsed,
        "best_combination": list(result.best_combination),
        "best_score": result.best_score,
        "best_structure": result.best_structure.value,
        "single_layer_best": result.single_layer_best,
        "single_layer_best_score": result.single_layer_best_score,
        "combination_beats_single": result.combination_beats_single,
        "improvement_over_single": result.improvement_over_single,
        "top_10": [
            {
                "layers": list(entry.layers),
                "best_structure": entry.best_structure.value,
                "best_score": entry.best_score,
                "all_scores": entry.all_scores,
            }
            for entry in result.top_10
        ],
        "top_100": [
            {
                "layers": list(entry.layers),
                "best_structure": entry.best_structure.value,
                "best_score": entry.best_score,
            }
            for entry in result.all_results[:100]
        ],
        "patterns": {
            # Top-level NaN floats are stored as null.
            key: None if isinstance(value, float) and math.isnan(value) else value
            for key, value in result.patterns.items()
        },
        "recommendation": result.recommendation,
    }

    with open(output_file, "w") as f:
        json.dump(results_json, f, indent=2)
    print(f"\nResults saved to: {output_file}")

    return result


def _exhaustive_run_cli_step(cli_args, timeout, failure_label):
    """Run one ``python -m wisent.core.main`` sub-command; return True on success.

    On a non-zero exit status or a timeout an ``ERROR:`` line is printed and
    False is returned, so the caller can abort without a traceback.
    """
    try:
        completed = subprocess.run(
            [sys.executable, "-m", "wisent.core.main", *cli_args],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        print(f"ERROR: {failure_label} timed out after {timeout}s")
        return False
    if completed.returncode != 0:
        print(f"ERROR: {failure_label} failed: {completed.stderr}")
        return False
    return True


def _exhaustive_stack_layer_activations(pairs_list, max_layers):
    """Group per-pair activations by layer and stack them into tensors.

    Only layers present in both the positive and the negative response of a
    pair contribute a row; layers above ``max_layers`` (when given) are
    skipped. Returns ``(pos_tensors, neg_tensors)`` keyed by integer layer,
    inserted in ascending layer order.
    """
    pos_by_layer: Dict[int, List[torch.Tensor]] = {}
    neg_by_layer: Dict[int, List[torch.Tensor]] = {}

    for pair in pairs_list:
        pos_la = pair.get('positive_response', {}).get('layers_activations', {})
        neg_la = pair.get('negative_response', {}).get('layers_activations', {})

        for layer_key, pos_values in pos_la.items():
            layer = int(layer_key)
            if max_layers is not None and layer > max_layers:
                continue
            if layer_key not in neg_la:
                continue
            # Flatten so every row is a 1-D vector regardless of nesting.
            pos_by_layer.setdefault(layer, []).append(
                torch.tensor(pos_values).reshape(-1)
            )
            neg_by_layer.setdefault(layer, []).append(
                torch.tensor(neg_la[layer_key]).reshape(-1)
            )

    pos_tensors = {}
    neg_tensors = {}
    for layer in sorted(pos_by_layer):
        pos_tensors[layer] = torch.stack(pos_by_layer[layer])
        neg_tensors[layer] = torch.stack(neg_by_layer[layer])
    return pos_tensors, neg_tensors
321
+
322
+
323
def run_limited_layer_analysis(
    task: str = "truthfulqa_gen",
    model: str = "meta-llama/Llama-3.2-1B-Instruct",
    num_pairs: int = 50,
    max_combo_size: int = 3,
    output_dir: str = "/home/ubuntu/output",
):
    """
    Run limited layer combination analysis.

    Tests 1-layer, 2-layer, 3-layer combinations plus all layers combined.
    Much faster than exhaustive: O(N^3) instead of O(2^N).

    For 36 layers with max_combo_size=3:
    - 36 + 630 + 7140 + 1 = 7,807 combinations (vs 68 billion exhaustive)

    Steps performed:
      0. Read the layer count from the model's Hugging Face config.
      1. Run the ``generate-pairs-from-task`` CLI in a subprocess (600s timeout).
      2. Run the ``get-activations`` CLI for layers 1..N in a subprocess
         (1800s timeout, token aggregation "final").
      3. Load the activations JSON and stack it into one tensor per layer.
      4. Call ``detect_geometry_limited`` with ``combination_method="concat"``.
      5. Print a report and write ``geometry_limited_<task>_<timestamp>.json``
         into ``output_dir`` (created if missing).

    Args:
        task: Task name passed to ``generate-pairs-from-task``.
        model: Model identifier passed to ``AutoConfig`` and ``get-activations``.
        num_pairs: Value passed as ``--limit`` to pair generation.
        max_combo_size: Largest combination size forwarded to the analysis.
        output_dir: Directory that receives the results JSON.

    Returns:
        The object returned by ``detect_geometry_limited``, or ``None`` when
        either CLI subprocess exits with a non-zero return code.

    Raises:
        subprocess.TimeoutExpired: If a CLI step exceeds its timeout (not
            caught here).
    """
    from wisent.core.contrastive_pairs.diagnostics.control_vectors import (
        detect_geometry_limited,
    )
    from math import comb

    sys.stdout.reconfigure(line_buffering=True)

    banner = "=" * 80
    print(banner)
    print("LIMITED LAYER COMBINATION ANALYSIS")
    print(banner)
    print(f"Task: {task}")
    print(f"Model: {model}")
    print(f"Num pairs: {num_pairs}")
    print(f"Max combo size: {max_combo_size}")
    print(f"Output dir: {output_dir}")

    # Auto-detect model layer count from config
    print("\n[0] Detecting model layer count from config...")
    start = time.time()
    from transformers import AutoConfig

    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run while the config loads - confirm this is intended
    # for every model id callers may pass.
    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
    # Different config classes name the layer count differently; fall back to 32.
    model_layers = (
        getattr(config, "num_hidden_layers", None)
        or getattr(config, "n_layer", None)
        or getattr(config, "num_layers", None)
        or 32
    )
    print(f" Model has {model_layers} layers (detected in {time.time() - start:.1f}s)")

    # Calculate expected combinations (an estimate based on the config layer count)
    total_combos = sum(
        comb(model_layers, r)
        for r in range(1, min(max_combo_size, model_layers) + 1)
    )
    if max_combo_size < model_layers:
        total_combos += 1  # all layers
    print(f" Will test {total_combos:,} combinations (1 to {max_combo_size} layers + all {model_layers})")
    print(banner)

    # The temporary directory is only needed until the activations are in memory.
    with tempfile.TemporaryDirectory() as tmpdir:
        pairs_file = os.path.join(tmpdir, "pairs.json")
        activations_file = os.path.join(tmpdir, "activations.json")

        # Step 1: Generate pairs
        print(f"\n[1] Generating {num_pairs} pairs for {task}...")
        start = time.time()
        proc = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "generate-pairs-from-task",
                task,
                "--output", pairs_file,
                "--limit", str(num_pairs),
            ],
            capture_output=True,
            text=True,
            timeout=600,
        )
        if proc.returncode != 0:
            print(f"ERROR: Pair generation failed: {proc.stderr}")
            return None
        print(f" Generated pairs in {time.time() - start:.1f}s")

        # Step 2: Get activations for ALL layers (layer ids are 1-based here)
        layers_str = ",".join(str(i) for i in range(1, model_layers + 1))

        print(f"\n[2] Extracting activations for layers 1-{model_layers}...")
        start = time.time()
        proc = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "get-activations",
                pairs_file,
                "--output", activations_file,
                "--model", model,
                "--layers", layers_str,
                "--token-aggregation", "final",
            ],
            capture_output=True,
            text=True,
            timeout=1800,
        )
        if proc.returncode != 0:
            print(f"ERROR: Activation extraction failed: {proc.stderr}")
            return None
        print(f" Extracted activations in {time.time() - start:.1f}s")

        # Step 3: Load activations
        print("\n[3] Loading activations from file...")
        with open(activations_file, "r") as f:
            data = json.load(f)

    pairs_list = data.get("pairs", [])
    print(f" Loaded {len(pairs_list)} pairs with activations")

    # Step 4: Convert to tensors by layer
    print("\n[4] Converting to tensors by layer...")
    pos_by_layer: Dict[int, List[torch.Tensor]] = {}
    neg_by_layer: Dict[int, List[torch.Tensor]] = {}

    for pair in pairs_list:
        pos_la = pair.get("positive_response", {}).get("layers_activations", {})
        neg_la = pair.get("negative_response", {}).get("layers_activations", {})

        for layer_key, pos_values in pos_la.items():
            layer = int(layer_key)
            pos_bucket = pos_by_layer.setdefault(layer, [])
            neg_bucket = neg_by_layer.setdefault(layer, [])

            # Keep positive/negative rows aligned: only use a layer for this
            # pair when both responses have an activation for it.
            if layer_key in neg_la:
                pos_bucket.append(torch.tensor(pos_values).reshape(-1))
                neg_bucket.append(torch.tensor(neg_la[layer_key]).reshape(-1))

    pos_tensors: Dict[int, torch.Tensor] = {}
    neg_tensors: Dict[int, torch.Tensor] = {}
    for layer in sorted(pos_by_layer):
        if pos_by_layer[layer]:
            pos_tensors[layer] = torch.stack(pos_by_layer[layer])
            neg_tensors[layer] = torch.stack(neg_by_layer[layer])
            print(f" Layer {layer}: {pos_tensors[layer].shape}")

    num_layers = len(pos_tensors)
    print(f"\n {num_layers} layers available")
    # NOTE(review): if no layer has paired activations, the dicts passed below
    # are empty; how detect_geometry_limited handles that is not visible here.

    # Step 5: Run limited analysis
    print(f"\n[5] Running limited analysis ({total_combos:,} combinations)...")
    start = time.time()

    # [combination index at last report, wall-clock time of last report]
    last_report = [0, time.time()]

    def progress_callback(current: int, total: int):
        """Print progress every 100 combinations, every 30s, or at the end."""
        now = time.time()
        if current - last_report[0] >= 100 or now - last_report[1] >= 30 or current == total:
            elapsed = now - start
            rate = current / elapsed if elapsed > 0 else 0
            remaining = (total - current) / rate if rate > 0 else 0
            # Guard against an empty run so progress reporting cannot crash.
            pct = 100 * current / total if total > 0 else 100.0
            print(f" Progress: {current:,}/{total:,} ({pct:.1f}%) - {rate:.1f} combos/sec - ETA: {remaining:.0f}s")
            last_report[0] = current
            last_report[1] = now

    result = detect_geometry_limited(
        pos_tensors,
        neg_tensors,
        max_combo_size=max_combo_size,
        combination_method="concat",
        progress_callback=progress_callback,
    )

    elapsed = time.time() - start
    # Report throughput from the number of combinations actually tested (the
    # earlier figure is only an estimate) and tolerate a zero elapsed time,
    # which can happen on clocks with coarse resolution.
    combos_per_sec = result.total_combinations / elapsed if elapsed > 0 else 0.0
    print(f"\n Completed in {elapsed:.1f}s ({combos_per_sec:.1f} combos/sec)")

    # Print results
    print("\n" + banner)
    print("RESULTS")
    print(banner)

    print(f"\nTotal combinations tested: {result.total_combinations}")
    print(f"\nBest combination: {result.best_combination}")
    print(f"Best score: {result.best_score:.4f}")
    print(f"Best structure: {result.best_structure.value}")

    print(f"\nBest single layer: L{result.single_layer_best}")
    print(f"Best single layer score: {result.single_layer_best_score:.4f}")
    print(f"Combination beats single: {result.combination_beats_single}")
    print(f"Improvement over single: {result.improvement_over_single:.4f}")

    print("\n--- Top 10 Combinations ---")
    for rank, entry in enumerate(result.top_10, start=1):
        label = "+".join(f"L{layer_id}" for layer_id in entry.layers)
        print(f"{rank}. {label}: {entry.best_score:.4f} ({entry.best_structure.value})")

    print(f"\nRecommendation: {result.recommendation}")

    # Save results
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(output_dir, f"geometry_limited_{task}_{timestamp}.json")

    results_json = {
        "task": task,
        "model": model,
        "num_pairs": num_pairs,
        "max_combo_size": max_combo_size,
        "total_combinations": result.total_combinations,
        "best_combination": list(result.best_combination),
        "best_score": result.best_score,
        "best_structure": result.best_structure.value,
        "single_layer_best": result.single_layer_best,
        "single_layer_best_score": result.single_layer_best_score,
        "combination_beats_single": result.combination_beats_single,
        "improvement_over_single": result.improvement_over_single,
        "top_10": [
            {
                "layers": list(entry.layers),
                "best_score": entry.best_score,
                "best_structure": entry.best_structure.value,
                "all_scores": entry.all_scores,
            }
            for entry in result.top_10
        ],
        # Only the first 100 entries of all_results are persisted, without
        # the per-structure score breakdown.
        "top_100": [
            {
                "layers": list(entry.layers),
                "best_score": entry.best_score,
                "best_structure": entry.best_structure.value,
            }
            for entry in result.all_results[:100]
        ],
        "patterns": result.patterns,
        "recommendation": result.recommendation,
    }

    with open(output_file, "w") as f:
        json.dump(results_json, f, indent=2)
    print(f"\nResults saved to: {output_file}")

    return result
549
+
550
+
551
def run_contiguous_layer_analysis(
    task: str = "truthfulqa_gen",
    model: str = "meta-llama/Llama-3.2-1B-Instruct",
    num_pairs: int = 50,
    output_dir: str = "/home/ubuntu/output",
):
    """
    Run contiguous layer combination analysis.

    Only tests combinations where layers are adjacent (1-2, 2-3, 1-5, etc.).
    Very fast: O(N^2) = N*(N+1)/2 combinations.

    For 36 layers: 666 combinations
    For 24 layers: 300 combinations

    Steps performed:
      0. Read the layer count from the model's Hugging Face config.
      1. Run the ``generate-pairs-from-task`` CLI in a subprocess (600s timeout).
      2. Run the ``get-activations`` CLI for layers 1..N in a subprocess
         (1800s timeout, token aggregation "final").
      3. Load the activations JSON and stack it into one tensor per layer.
      4. Call ``detect_geometry_contiguous`` with ``combination_method="concat"``.
      5. Print a report and write ``geometry_contiguous_<task>_<timestamp>.json``
         into ``output_dir`` (created if missing).

    Args:
        task: Task name passed to ``generate-pairs-from-task``.
        model: Model identifier passed to ``AutoConfig`` and ``get-activations``.
        num_pairs: Value passed as ``--limit`` to pair generation.
        output_dir: Directory that receives the results JSON.

    Returns:
        The object returned by ``detect_geometry_contiguous``, or ``None`` when
        either CLI subprocess exits with a non-zero return code.

    Raises:
        subprocess.TimeoutExpired: If a CLI step exceeds its timeout (not
            caught here).
    """
    from wisent.core.contrastive_pairs.diagnostics.control_vectors import (
        detect_geometry_contiguous,
    )

    sys.stdout.reconfigure(line_buffering=True)

    banner = "=" * 80
    print(banner)
    print("CONTIGUOUS LAYER COMBINATION ANALYSIS")
    print(banner)
    print(f"Task: {task}")
    print(f"Model: {model}")
    print(f"Num pairs: {num_pairs}")
    print(f"Output dir: {output_dir}")

    # Auto-detect model layer count from config
    print("\n[0] Detecting model layer count from config...")
    start = time.time()
    from transformers import AutoConfig

    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run while the config loads - confirm this is intended
    # for every model id callers may pass.
    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
    # Different config classes name the layer count differently; fall back to 32.
    model_layers = (
        getattr(config, "num_hidden_layers", None)
        or getattr(config, "n_layer", None)
        or getattr(config, "num_layers", None)
        or 32
    )
    print(f" Model has {model_layers} layers (detected in {time.time() - start:.1f}s)")

    # Calculate expected combinations (an estimate based on the config layer count)
    total_combos = model_layers * (model_layers + 1) // 2
    print(f" Will test {total_combos:,} contiguous combinations")
    print(banner)

    # The temporary directory is only needed until the activations are in memory.
    with tempfile.TemporaryDirectory() as tmpdir:
        pairs_file = os.path.join(tmpdir, "pairs.json")
        activations_file = os.path.join(tmpdir, "activations.json")

        # Step 1: Generate pairs
        print(f"\n[1] Generating {num_pairs} pairs for {task}...")
        start = time.time()
        proc = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "generate-pairs-from-task",
                task,
                "--output", pairs_file,
                "--limit", str(num_pairs),
            ],
            capture_output=True,
            text=True,
            timeout=600,
        )
        if proc.returncode != 0:
            print(f"ERROR: Pair generation failed: {proc.stderr}")
            return None
        print(f" Generated pairs in {time.time() - start:.1f}s")

        # Step 2: Get activations for ALL layers (layer ids are 1-based here)
        layers_str = ",".join(str(i) for i in range(1, model_layers + 1))

        print(f"\n[2] Extracting activations for layers 1-{model_layers}...")
        start = time.time()
        proc = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "get-activations",
                pairs_file,
                "--output", activations_file,
                "--model", model,
                "--layers", layers_str,
                "--token-aggregation", "final",
            ],
            capture_output=True,
            text=True,
            timeout=1800,
        )
        if proc.returncode != 0:
            print(f"ERROR: Activation extraction failed: {proc.stderr}")
            return None
        print(f" Extracted activations in {time.time() - start:.1f}s")

        # Step 3: Load activations
        print("\n[3] Loading activations from file...")
        with open(activations_file, "r") as f:
            data = json.load(f)

    pairs_list = data.get("pairs", [])
    print(f" Loaded {len(pairs_list)} pairs with activations")

    # Step 4: Convert to tensors by layer
    print("\n[4] Converting to tensors by layer...")
    pos_by_layer: Dict[int, List[torch.Tensor]] = {}
    neg_by_layer: Dict[int, List[torch.Tensor]] = {}

    for pair in pairs_list:
        pos_la = pair.get("positive_response", {}).get("layers_activations", {})
        neg_la = pair.get("negative_response", {}).get("layers_activations", {})

        for layer_key, pos_values in pos_la.items():
            layer = int(layer_key)
            pos_bucket = pos_by_layer.setdefault(layer, [])
            neg_bucket = neg_by_layer.setdefault(layer, [])

            # Keep positive/negative rows aligned: only use a layer for this
            # pair when both responses have an activation for it.
            if layer_key in neg_la:
                pos_bucket.append(torch.tensor(pos_values).reshape(-1))
                neg_bucket.append(torch.tensor(neg_la[layer_key]).reshape(-1))

    pos_tensors: Dict[int, torch.Tensor] = {}
    neg_tensors: Dict[int, torch.Tensor] = {}
    for layer in sorted(pos_by_layer):
        if pos_by_layer[layer]:
            pos_tensors[layer] = torch.stack(pos_by_layer[layer])
            neg_tensors[layer] = torch.stack(neg_by_layer[layer])
            print(f" Layer {layer}: {pos_tensors[layer].shape}")

    num_layers = len(pos_tensors)
    print(f"\n {num_layers} layers available")
    # NOTE(review): if no layer has paired activations, the dicts passed below
    # are empty; how detect_geometry_contiguous handles that is not visible here.

    # Step 5: Run contiguous analysis
    print(f"\n[5] Running contiguous analysis ({total_combos:,} combinations)...")
    start = time.time()

    # [combination index at last report, wall-clock time of last report]
    last_report = [0, time.time()]

    def progress_callback(current: int, total: int):
        """Print progress every 50 combinations, every 30s, or at the end."""
        now = time.time()
        if current - last_report[0] >= 50 or now - last_report[1] >= 30 or current == total:
            elapsed = now - start
            rate = current / elapsed if elapsed > 0 else 0
            remaining = (total - current) / rate if rate > 0 else 0
            # Guard against an empty run so progress reporting cannot crash.
            pct = 100 * current / total if total > 0 else 100.0
            print(f" Progress: {current:,}/{total:,} ({pct:.1f}%) - {rate:.1f} combos/sec - ETA: {remaining:.0f}s")
            last_report[0] = current
            last_report[1] = now

    result = detect_geometry_contiguous(
        pos_tensors,
        neg_tensors,
        combination_method="concat",
        progress_callback=progress_callback,
    )

    elapsed = time.time() - start
    # Report throughput from the number of combinations actually tested (the
    # earlier figure is only an estimate) and tolerate a zero elapsed time,
    # which can happen on clocks with coarse resolution.
    combos_per_sec = result.total_combinations / elapsed if elapsed > 0 else 0.0
    print(f"\n Completed in {elapsed:.1f}s ({combos_per_sec:.1f} combos/sec)")

    # Print results
    print("\n" + banner)
    print("RESULTS")
    print(banner)

    print(f"\nTotal combinations tested: {result.total_combinations}")
    print(f"\nBest combination: {result.best_combination}")
    print(f"Best score: {result.best_score:.4f}")
    print(f"Best structure: {result.best_structure.value}")

    print(f"\nBest single layer: L{result.single_layer_best}")
    print(f"Best single layer score: {result.single_layer_best_score:.4f}")
    print(f"Combination beats single: {result.combination_beats_single}")
    print(f"Improvement over single: {result.improvement_over_single:.4f}")

    print("\n--- Top 10 Combinations ---")
    for rank, entry in enumerate(result.top_10, start=1):
        # Multi-layer entries are shown as a first-last range.
        if len(entry.layers) > 1:
            label = f"L{entry.layers[0]}-L{entry.layers[-1]}"
        else:
            label = f"L{entry.layers[0]}"
        print(f"{rank}. {label} ({len(entry.layers)} layers): {entry.best_score:.4f} ({entry.best_structure.value})")

    print(f"\nRecommendation: {result.recommendation}")

    # Save results
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(output_dir, f"geometry_contiguous_{task}_{timestamp}.json")

    results_json = {
        "task": task,
        "model": model,
        "num_pairs": num_pairs,
        "mode": "contiguous",
        "total_combinations": result.total_combinations,
        "best_combination": list(result.best_combination),
        "best_score": result.best_score,
        "best_structure": result.best_structure.value,
        "single_layer_best": result.single_layer_best,
        "single_layer_best_score": result.single_layer_best_score,
        "combination_beats_single": result.combination_beats_single,
        "improvement_over_single": result.improvement_over_single,
        "top_10": [
            {
                "layers": list(entry.layers),
                "best_score": entry.best_score,
                "best_structure": entry.best_structure.value,
                "all_scores": entry.all_scores,
            }
            for entry in result.top_10
        ],
        # Only the first 100 entries of all_results are persisted, without
        # the per-structure score breakdown.
        "top_100": [
            {
                "layers": list(entry.layers),
                "best_score": entry.best_score,
                "best_structure": entry.best_structure.value,
            }
            for entry in result.all_results[:100]
        ],
        "patterns": result.patterns,
        "recommendation": result.recommendation,
    }

    with open(output_file, "w") as f:
        json.dump(results_json, f, indent=2)
    print(f"\nResults saved to: {output_file}")

    return result
771
+
772
+
773
# Token aggregation methods accepted by the --token-aggregation CLI option
# and iterated over by the comprehensive sweep.
TOKEN_AGGREGATIONS = [
    "final",
    "average",
    "first",
    "max",
    "min",
    "max_score",
]

# Prompt construction strategies accepted by the --prompt-strategy CLI option
# and iterated over by the comprehensive sweep.
PROMPT_STRATEGIES = [
    "chat_template",
    "direct_completion",
    "instruction_following",
    "multiple_choice",
    "role_playing",
]
775
+
776
+
777
def run_smart_layer_analysis(
    task: str = "truthfulqa_gen",
    model: str = "meta-llama/Llama-3.2-1B-Instruct",
    num_pairs: int = 50,
    max_combo_size: int = 3,
    token_aggregation: str = "final",
    prompt_strategy: str = "chat_template",
    output_dir: str = "/home/ubuntu/output",
):
    """
    Run smart layer combination analysis.

    Combines contiguous + limited search: tests all contiguous ranges (L1-L5, L3-L8, etc.)
    plus all 1,2,3-layer non-contiguous combinations. Deduplicates overlaps.

    Counting the union described above with max_combo_size=3 (contiguous
    ranges + 1..3-layer combinations - contiguous ranges of length <= 3):
    For 36 layers: 666 + 7,806 - 105 = 8,367 unique combinations
    For 24 layers: 300 + 2,324 - 69 = 2,555 unique combinations
    The authoritative count is ``result.total_combinations``.

    Steps performed:
      0. Read the layer count from the model's Hugging Face config.
      1. Run the ``generate-pairs-from-task`` CLI in a subprocess (600s timeout).
      2. Run the ``get-activations`` CLI for layers 1..N in a subprocess
         (1800s timeout) with the given token aggregation and prompt strategy.
      3. Load the activations JSON and stack it into one tensor per layer.
      4. Call ``detect_geometry_smart`` with ``combination_method="concat"``.
      5. Print a report and write
         ``geometry_smart_<task>_<token_aggregation>_<prompt_strategy>_<timestamp>.json``
         into ``output_dir`` (created if missing).

    Args:
        task: Task name passed to ``generate-pairs-from-task``.
        model: Model identifier passed to ``AutoConfig`` and ``get-activations``.
        num_pairs: Value passed as ``--limit`` to pair generation.
        max_combo_size: Largest combination size forwarded to the analysis.
        token_aggregation: Value passed as ``--token-aggregation``.
        prompt_strategy: Value passed as ``--prompt-strategy``.
        output_dir: Directory that receives the results JSON.

    Returns:
        The object returned by ``detect_geometry_smart``, or ``None`` when
        either CLI subprocess exits with a non-zero return code.

    Raises:
        subprocess.TimeoutExpired: If a CLI step exceeds its timeout (not
            caught here).
    """
    from wisent.core.contrastive_pairs.diagnostics.control_vectors import (
        detect_geometry_smart,
    )
    from math import comb

    sys.stdout.reconfigure(line_buffering=True)

    banner = "=" * 80
    print(banner)
    print("SMART LAYER COMBINATION ANALYSIS")
    print("(Contiguous + Limited 1,2,3-layer combinations)")
    print(banner)
    print(f"Task: {task}")
    print(f"Model: {model}")
    print(f"Num pairs: {num_pairs}")
    print(f"Max combo size: {max_combo_size}")
    print(f"Token aggregation: {token_aggregation}")
    print(f"Prompt strategy: {prompt_strategy}")
    print(f"Output dir: {output_dir}")

    # Auto-detect model layer count from config
    print("\n[0] Detecting model layer count from config...")
    start = time.time()
    from transformers import AutoConfig

    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run while the config loads - confirm this is intended
    # for every model id callers may pass.
    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
    # Different config classes name the layer count differently; fall back to 32.
    model_layers = (
        getattr(config, "num_hidden_layers", None)
        or getattr(config, "n_layer", None)
        or getattr(config, "num_layers", None)
        or 32
    )
    print(f" Model has {model_layers} layers (detected in {time.time() - start:.1f}s)")

    # Calculate expected combinations (estimate, actual will be less due to deduplication)
    contiguous = model_layers * (model_layers + 1) // 2
    limited = sum(
        comb(model_layers, r)
        for r in range(1, min(max_combo_size, model_layers) + 1)
    )
    print(f" Contiguous: {contiguous:,}, Limited 1-{max_combo_size}: {limited:,}")
    print(" (Actual will be less due to deduplication)")
    print(banner)

    # The temporary directory is only needed until the activations are in memory.
    with tempfile.TemporaryDirectory() as tmpdir:
        pairs_file = os.path.join(tmpdir, "pairs.json")
        activations_file = os.path.join(tmpdir, "activations.json")

        # Step 1: Generate pairs
        print(f"\n[1] Generating {num_pairs} pairs for {task}...")
        start = time.time()
        proc = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "generate-pairs-from-task",
                task,
                "--output", pairs_file,
                "--limit", str(num_pairs),
            ],
            capture_output=True,
            text=True,
            timeout=600,
        )
        if proc.returncode != 0:
            print(f"ERROR: Pair generation failed: {proc.stderr}")
            return None
        print(f" Generated pairs in {time.time() - start:.1f}s")

        # Step 2: Get activations for ALL layers (layer ids are 1-based here)
        layers_str = ",".join(str(i) for i in range(1, model_layers + 1))

        print(f"\n[2] Extracting activations for layers 1-{model_layers}...")
        print(f" Token aggregation: {token_aggregation}, Prompt strategy: {prompt_strategy}")
        start = time.time()
        proc = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "get-activations",
                pairs_file,
                "--output", activations_file,
                "--model", model,
                "--layers", layers_str,
                "--token-aggregation", token_aggregation,
                "--prompt-strategy", prompt_strategy,
            ],
            capture_output=True,
            text=True,
            timeout=1800,
        )
        if proc.returncode != 0:
            print(f"ERROR: Activation extraction failed: {proc.stderr}")
            return None
        print(f" Extracted activations in {time.time() - start:.1f}s")

        # Step 3: Load activations
        print("\n[3] Loading activations from file...")
        with open(activations_file, "r") as f:
            data = json.load(f)

    pairs_list = data.get("pairs", [])
    print(f" Loaded {len(pairs_list)} pairs with activations")

    # Step 4: Convert to tensors by layer
    print("\n[4] Converting to tensors by layer...")
    pos_by_layer: Dict[int, List[torch.Tensor]] = {}
    neg_by_layer: Dict[int, List[torch.Tensor]] = {}

    for pair in pairs_list:
        pos_la = pair.get("positive_response", {}).get("layers_activations", {})
        neg_la = pair.get("negative_response", {}).get("layers_activations", {})

        for layer_key, pos_values in pos_la.items():
            layer = int(layer_key)
            pos_bucket = pos_by_layer.setdefault(layer, [])
            neg_bucket = neg_by_layer.setdefault(layer, [])

            # Keep positive/negative rows aligned: only use a layer for this
            # pair when both responses have an activation for it.
            if layer_key in neg_la:
                pos_bucket.append(torch.tensor(pos_values).reshape(-1))
                neg_bucket.append(torch.tensor(neg_la[layer_key]).reshape(-1))

    pos_tensors: Dict[int, torch.Tensor] = {}
    neg_tensors: Dict[int, torch.Tensor] = {}
    for layer in sorted(pos_by_layer):
        if pos_by_layer[layer]:
            pos_tensors[layer] = torch.stack(pos_by_layer[layer])
            neg_tensors[layer] = torch.stack(neg_by_layer[layer])
            print(f" Layer {layer}: {pos_tensors[layer].shape}")

    num_layers = len(pos_tensors)
    print(f"\n {num_layers} layers available")
    # NOTE(review): if no layer has paired activations, the dicts passed below
    # are empty; how detect_geometry_smart handles that is not visible here.

    # Step 5: Run smart analysis
    print("\n[5] Running smart analysis...")
    start = time.time()

    # [combination index at last report, wall-clock time of last report]
    last_report = [0, time.time()]

    def progress_callback(current: int, total: int):
        """Print progress every 100 combinations, every 30s, or at the end."""
        now = time.time()
        if current - last_report[0] >= 100 or now - last_report[1] >= 30 or current == total:
            elapsed = now - start
            rate = current / elapsed if elapsed > 0 else 0
            remaining = (total - current) / rate if rate > 0 else 0
            # Guard against an empty run so progress reporting cannot crash.
            pct = 100 * current / total if total > 0 else 100.0
            print(f" Progress: {current:,}/{total:,} ({pct:.1f}%) - {rate:.1f} combos/sec - ETA: {remaining:.0f}s")
            last_report[0] = current
            last_report[1] = now

    result = detect_geometry_smart(
        pos_tensors,
        neg_tensors,
        max_combo_size=max_combo_size,
        combination_method="concat",
        progress_callback=progress_callback,
    )

    elapsed = time.time() - start
    # Tolerate a zero elapsed time, which can happen on clocks with coarse
    # resolution when the analysis finishes almost instantly.
    combos_per_sec = result.total_combinations / elapsed if elapsed > 0 else 0.0
    print(f"\n Completed in {elapsed:.1f}s ({combos_per_sec:.1f} combos/sec)")

    # Print results
    print("\n" + banner)
    print("RESULTS")
    print(banner)

    print(f"\nTotal combinations tested: {result.total_combinations}")
    print(f"\nBest combination: {result.best_combination}")
    print(f"Best score: {result.best_score:.4f}")
    print(f"Best structure: {result.best_structure.value}")

    print(f"\nBest single layer: L{result.single_layer_best}")
    print(f"Best single layer score: {result.single_layer_best_score:.4f}")
    print(f"Combination beats single: {result.combination_beats_single}")
    print(f"Improvement over single: {result.improvement_over_single:.4f}")

    print("\n--- Top 10 Combinations ---")
    for rank, entry in enumerate(result.top_10, start=1):
        span = entry.layers[-1] - entry.layers[0] if entry.layers else 0
        if len(entry.layers) > 1 and span == len(entry.layers) - 1:
            # Contiguous: last - first equals count - 1, so show it as a range
            label = f"L{entry.layers[0]}-L{entry.layers[-1]}"
        else:
            label = "+".join(f"L{layer_id}" for layer_id in entry.layers)
        print(f"{rank}. {label} ({len(entry.layers)} layers): {entry.best_score:.4f} ({entry.best_structure.value})")

    print(f"\nRecommendation: {result.recommendation}")

    # Save results
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(output_dir, f"geometry_smart_{task}_{token_aggregation}_{prompt_strategy}_{timestamp}.json")

    results_json = {
        "task": task,
        "model": model,
        "num_pairs": num_pairs,
        "mode": "smart",
        "max_combo_size": max_combo_size,
        "token_aggregation": token_aggregation,
        "prompt_strategy": prompt_strategy,
        "total_combinations": result.total_combinations,
        "best_combination": list(result.best_combination),
        "best_score": result.best_score,
        "best_structure": result.best_structure.value,
        "single_layer_best": result.single_layer_best,
        "single_layer_best_score": result.single_layer_best_score,
        "combination_beats_single": result.combination_beats_single,
        "improvement_over_single": result.improvement_over_single,
        "top_10": [
            {
                "layers": list(entry.layers),
                "best_score": entry.best_score,
                "best_structure": entry.best_structure.value,
                "all_scores": entry.all_scores,
            }
            for entry in result.top_10
        ],
        # Only the first 100 entries of all_results are persisted, without
        # the per-structure score breakdown.
        "top_100": [
            {
                "layers": list(entry.layers),
                "best_score": entry.best_score,
                "best_structure": entry.best_structure.value,
            }
            for entry in result.all_results[:100]
        ],
        "patterns": result.patterns,
        "recommendation": result.recommendation,
    }

    with open(output_file, "w") as f:
        json.dump(results_json, f, indent=2)
    print(f"\nResults saved to: {output_file}")

    return result
1017
+
1018
+
1019
def run_comprehensive_sweep(
    task: str = "truthfulqa_gen",
    model: str = "meta-llama/Llama-3.2-1B-Instruct",
    num_pairs: int = 50,
    max_combo_size: int = 3,
    output_dir: str = "/home/ubuntu/output",
):
    """
    Run comprehensive sweep across all token aggregations and prompt strategies.

    Tests 6 token aggregations x 5 prompt strategies = 30 configurations,
    each with smart layer combination search.

    Every configuration ends up in the summary: successful runs with their
    best-combination fields, failed runs (an exception, or no result returned
    by ``run_smart_layer_analysis``) with an ``"error"`` message.

    Args:
        task: Task name forwarded to ``run_smart_layer_analysis``.
        model: Model identifier forwarded to ``run_smart_layer_analysis``.
        num_pairs: Number of pairs forwarded to ``run_smart_layer_analysis``.
        max_combo_size: Forwarded to ``run_smart_layer_analysis``.
        output_dir: Directory for per-config results and the sweep summary
            (created if missing).

    Returns:
        The summary dict that is also written to
        ``geometry_sweep_summary_<task>_<timestamp>.json``.
    """
    sys.stdout.reconfigure(line_buffering=True)

    banner = "=" * 80
    total_configs = len(TOKEN_AGGREGATIONS) * len(PROMPT_STRATEGIES)

    print(banner)
    print("COMPREHENSIVE GEOMETRY SWEEP")
    print(banner)
    print(f"Task: {task}")
    print(f"Model: {model}")
    print(f"Num pairs: {num_pairs}")
    print(f"Token aggregations: {TOKEN_AGGREGATIONS}")
    print(f"Prompt strategies: {PROMPT_STRATEGIES}")
    print(f"Total configurations: {total_configs}")
    print(banner)

    all_results = []
    config_idx = 0

    for token_agg in TOKEN_AGGREGATIONS:
        for prompt_strat in PROMPT_STRATEGIES:
            config_idx += 1
            print("\n" + banner)
            print(f"CONFIG {config_idx}/{total_configs}: {token_agg} + {prompt_strat}")
            print(banner)

            record = {
                "token_aggregation": token_agg,
                "prompt_strategy": prompt_strat,
            }
            try:
                result = run_smart_layer_analysis(
                    task=task,
                    model=model,
                    num_pairs=num_pairs,
                    max_combo_size=max_combo_size,
                    token_aggregation=token_agg,
                    prompt_strategy=prompt_strat,
                    output_dir=output_dir,
                )

                if result is None:
                    # The analysis returns None when one of its subprocess
                    # steps fails; record it so the summary is not missing
                    # configurations without explanation.
                    record["error"] = "analysis returned no result (see ERROR output above)"
                else:
                    record.update({
                        "best_combination": list(result.best_combination),
                        "best_score": result.best_score,
                        "best_structure": result.best_structure.value,
                        "single_layer_best": result.single_layer_best,
                        "single_layer_best_score": result.single_layer_best_score,
                        "improvement_over_single": result.improvement_over_single,
                    })
            except Exception as e:
                # Best-effort sweep: one failing configuration must not abort
                # the remaining ones, so log it and keep going.
                print(f"ERROR in config {token_agg}+{prompt_strat}: {e}")
                record = {
                    "token_aggregation": token_agg,
                    "prompt_strategy": prompt_strat,
                    "error": str(e),
                }
            all_results.append(record)

    # Save summary
    print("\n" + banner)
    print("SWEEP SUMMARY")
    print(banner)

    # Sort by best_score
    successful = [r for r in all_results if "best_score" in r]
    successful.sort(key=lambda x: x["best_score"], reverse=True)

    print(f"\nCompleted {len(successful)}/{total_configs} configurations")
    print("\n--- Top 10 Configurations ---")
    for rank, r in enumerate(successful[:10], start=1):
        print(f"{rank}. {r['token_aggregation']}+{r['prompt_strategy']}: {r['best_score']:.4f} ({r['best_structure']}) - layers {r['best_combination']}")

    # Save sweep summary. The directory is created here as well because, when
    # every configuration failed before saving its own results, nothing else
    # has created it yet.
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    summary_file = os.path.join(output_dir, f"geometry_sweep_summary_{task}_{timestamp}.json")

    summary = {
        "task": task,
        "model": model,
        "num_pairs": num_pairs,
        "max_combo_size": max_combo_size,
        "token_aggregations": TOKEN_AGGREGATIONS,
        "prompt_strategies": PROMPT_STRATEGIES,
        "total_configurations": total_configs,
        "successful_configurations": len(successful),
        "all_results": all_results,
        "top_10": successful[:10],
    }

    with open(summary_file, "w") as f:
        json.dump(summary, f, indent=2)
    print(f"\nSweep summary saved to: {summary_file}")

    return summary
1122
+
1123
+
1124
if __name__ == "__main__":
    import argparse

    # Command-line entry point: parse the options, then dispatch to exactly
    # one analysis mode. Precedence when several mode flags are given:
    # --sweep, --exhaustive, --contiguous, --limited, otherwise smart search.
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", default="truthfulqa_gen")
    parser.add_argument("--model", default="meta-llama/Llama-3.2-1B-Instruct")
    parser.add_argument("--num-pairs", type=int, default=50)
    # WARNING: Do NOT use --max-layers in production runs!
    # The whole point of exhaustive analysis is to test ALL layers.
    # If you need more memory, use a larger instance type instead.
    parser.add_argument(
        "--max-layers",
        type=int,
        default=None,
        help="DEBUG ONLY - DO NOT USE IN PRODUCTION. Use larger instance instead.",
    )
    parser.add_argument("--output-dir", default="/home/ubuntu/output")
    parser.add_argument(
        "--sweep",
        action="store_true",
        help="Run comprehensive sweep across all token aggregations and prompt strategies",
    )
    # --smart defaults to True and is not consulted below; smart search is
    # simply the fallback when no other mode flag is set.
    parser.add_argument(
        "--smart",
        action="store_true",
        default=True,
        help="Use smart search (contiguous + 1,2,3-layer) - DEFAULT",
    )
    parser.add_argument(
        "--limited",
        action="store_true",
        help="Use limited search (1,2,3-layer combos + all layers)",
    )
    parser.add_argument(
        "--contiguous",
        action="store_true",
        help="Use contiguous search (adjacent layers only)",
    )
    parser.add_argument(
        "--exhaustive",
        action="store_true",
        help="Use exhaustive search (all 2^N combinations) - VERY SLOW",
    )
    parser.add_argument(
        "--max-combo-size",
        type=int,
        default=3,
        help="Max combination size for limited/smart search (default: 3)",
    )
    parser.add_argument(
        "--token-aggregation",
        default="final",
        choices=TOKEN_AGGREGATIONS,
        help="Token aggregation method (default: final)",
    )
    parser.add_argument(
        "--prompt-strategy",
        default="chat_template",
        choices=PROMPT_STRATEGIES,
        help="Prompt construction strategy (default: chat_template)",
    )
    args = parser.parse_args()

    # Print loud warning if max_layers is set
    if args.max_layers is not None:
        alert_bar = "!" * 80
        print(alert_bar)
        print("WARNING: --max-layers is set! This should ONLY be used for debugging.")
        print("For production runs, use a larger instance type instead of capping layers.")
        print(alert_bar)

    # Keyword arguments every analysis mode accepts.
    shared_kwargs = {
        "task": args.task,
        "model": args.model,
        "num_pairs": args.num_pairs,
        "output_dir": args.output_dir,
    }

    if args.sweep:
        run_comprehensive_sweep(max_combo_size=args.max_combo_size, **shared_kwargs)
    elif args.exhaustive:
        run_exhaustive_layer_analysis(max_layers=args.max_layers, **shared_kwargs)
    elif args.contiguous:
        run_contiguous_layer_analysis(**shared_kwargs)
    elif args.limited:
        run_limited_layer_analysis(max_combo_size=args.max_combo_size, **shared_kwargs)
    else:
        # Default: smart search
        run_smart_layer_analysis(
            max_combo_size=args.max_combo_size,
            token_aggregation=args.token_aggregation,
            prompt_strategy=args.prompt_strategy,
            **shared_kwargs,
        )