wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1020) hide show
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +22 -6
  3. wisent/core/activations/activation_cache.py +393 -0
  4. wisent/core/activations/activations.py +22 -40
  5. wisent/core/activations/activations_collector.py +145 -373
  6. wisent/core/activations/classifier_inference_strategy.py +195 -0
  7. wisent/core/activations/core/atoms.py +8 -92
  8. wisent/core/activations/extraction_strategy.py +480 -0
  9. wisent/core/agent/diagnose/response_diagnostics.py +3 -3
  10. wisent/core/agent/diagnose.py +3 -3
  11. wisent/core/autonomous_agent.py +2 -2
  12. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  13. wisent/core/cli/__init__.py +2 -1
  14. wisent/core/cli/agent/apply_steering.py +25 -31
  15. wisent/core/cli/agent/evaluate_response.py +18 -20
  16. wisent/core/cli/agent/train_classifier.py +36 -26
  17. wisent/core/cli/check_linearity.py +35 -3
  18. wisent/core/cli/cluster_benchmarks.py +470 -0
  19. wisent/core/cli/create_steering_vector.py +19 -9
  20. wisent/core/cli/diagnose_vectors.py +7 -4
  21. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  22. wisent/core/cli/generate_pairs_from_task.py +9 -56
  23. wisent/core/cli/generate_vector_from_task.py +4 -0
  24. wisent/core/cli/geometry_search.py +137 -0
  25. wisent/core/cli/get_activations.py +13 -37
  26. wisent/core/cli/method_optimizer.py +860 -0
  27. wisent/core/cli/modify_weights.py +3 -2
  28. wisent/core/cli/optimize.py +44 -5
  29. wisent/core/cli/optimize_classification.py +5 -6
  30. wisent/core/cli/optimize_sample_size.py +9 -23
  31. wisent/core/cli/optimize_steering.py +433 -159
  32. wisent/core/cli/optimize_weights.py +67 -7
  33. wisent/core/cli/preview_pairs.py +203 -0
  34. wisent/core/cli/steering_method_trainer.py +8 -7
  35. wisent/core/cli/steering_search_space.py +20 -15
  36. wisent/core/cli/tasks.py +31 -117
  37. wisent/core/cli/train_unified_goodness.py +18 -19
  38. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1582 -177
  39. wisent/core/contrastive_pairs/diagnostics/linearity.py +70 -80
  40. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  53. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  54. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
  55. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  56. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  57. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  58. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  59. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  60. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  61. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  62. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  63. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  64. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  65. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  66. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  67. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  68. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  69. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
  70. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +11 -5
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  273. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  274. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  275. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  276. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  277. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  278. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  279. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  280. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  281. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  282. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  283. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  284. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  285. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  286. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  287. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  288. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  289. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  290. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  291. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  292. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  293. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  294. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  295. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  296. wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
  297. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
  298. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
  299. wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
  300. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
  301. wisent/core/evaluators/personalization/coherence.py +46 -0
  302. wisent/core/geometry_runner.py +995 -0
  303. wisent/core/geometry_search_space.py +237 -0
  304. wisent/core/hyperparameter_optimizer.py +14 -14
  305. wisent/core/lm_eval_harness_ground_truth.py +7 -11
  306. wisent/core/main.py +6 -0
  307. wisent/core/models/core/atoms.py +5 -3
  308. wisent/core/models/wisent_model.py +9 -8
  309. wisent/core/opti/methods/opti_weights.py +29 -2
  310. wisent/core/optuna/classifier/activation_generator.py +14 -12
  311. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  312. wisent/core/optuna/steering/steering_optimization.py +14 -9
  313. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  314. wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
  315. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  316. wisent/core/parser_arguments/generate_vector_from_task_parser.py +22 -2
  317. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  318. wisent/core/parser_arguments/main_parser.py +16 -0
  319. wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
  320. wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
  321. wisent/core/parser_arguments/tasks_parser.py +7 -19
  322. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  323. wisent/core/steering.py +5 -3
  324. wisent/core/steering_methods/core/atoms.py +1 -2
  325. wisent/core/steering_methods/methods/caa.py +1 -1
  326. wisent/core/steering_methods/methods/hyperplane.py +75 -0
  327. wisent/core/steering_methods/methods/prism.py +1 -2
  328. wisent/core/steering_methods/methods/pulse.py +39 -8
  329. wisent/core/steering_methods/methods/titan.py +59 -14
  330. wisent/core/steering_methods/registry.py +52 -12
  331. wisent/core/steering_optimizer.py +15 -15
  332. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  333. wisent/core/trainers/steering_trainer.py +11 -20
  334. wisent/core/utils/device.py +27 -27
  335. wisent/core/utils/layer_combinations.py +70 -0
  336. wisent/examples/__init__.py +1 -0
  337. wisent/examples/scripts/__init__.py +1 -0
  338. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  339. wisent/examples/scripts/discover_directions.py +469 -0
  340. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  341. wisent/examples/scripts/generate_paper_data.py +384 -0
  342. wisent/examples/scripts/intervention_validation.py +626 -0
  343. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  344. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  345. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  346. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  347. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  348. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  349. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  350. wisent/examples/scripts/search_all_short_names.py +31 -0
  351. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  352. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  353. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  354. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  355. wisent/examples/scripts/test_one_benchmark.py +324 -0
  356. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  357. wisent/examples/scripts/threshold_analysis.py +434 -0
  358. wisent/examples/scripts/visualization_gallery.py +582 -0
  359. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  360. wisent/parameters/lm_eval/category_directions.json +137 -0
  361. wisent/parameters/lm_eval/repair_plan.json +282 -0
  362. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
  363. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  364. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  365. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  366. wisent/scripts/run_quality_metrics_sweep.sh +22 -27
  367. wisent/tests/test_aggregation_geometry.py +236 -0
  368. wisent/tests/test_detector_accuracy.py +163 -0
  369. wisent/tests/test_geometry_exhaustive.py +1202 -0
  370. wisent/tests/visualize_geometry.py +255 -61
  371. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  372. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/RECORD +376 -974
  373. wisent/core/activations/prompt_construction_strategy.py +0 -47
  374. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  375. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
  376. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
  377. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
  378. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
  379. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
  380. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
  381. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
  382. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
  383. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
  384. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
  385. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
  386. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
  387. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
  388. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
  389. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
  390. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
  391. wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
  392. wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
  393. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
  394. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
  395. wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
  396. wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
  397. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
  398. wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
  399. wisent/examples/scripts/results/benchmark_tags.json +0 -917
  400. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
  401. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
  402. wisent/examples/scripts/results/failing_benchmarks.json +0 -946
  403. wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
  404. wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
  405. wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
  406. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
  407. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
  408. wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
  409. wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
  410. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
  411. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
  412. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
  413. wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
  414. wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
  415. wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
  416. wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
  417. wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
  418. wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
  419. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
  420. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
  421. wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
  422. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
  423. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
  424. wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
  425. wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
  426. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
  427. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
  428. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
  429. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
  430. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  431. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
  432. wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
  433. wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
  434. wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
  435. wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
  436. wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
  437. wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
  438. wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
  439. wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
  440. wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
  441. wisent/examples/scripts/results/test_aime_pairs.json +0 -8
  442. wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
  443. wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
  444. wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
  445. wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
  446. wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
  447. wisent/examples/scripts/results/test_anli_pairs.json +0 -8
  448. wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
  449. wisent/examples/scripts/results/test_apps_pairs.json +0 -8
  450. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
  451. wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
  452. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
  453. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
  454. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
  455. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
  456. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
  457. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
  458. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
  459. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
  460. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
  461. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
  462. wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
  463. wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
  464. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
  465. wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
  466. wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
  467. wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
  468. wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
  469. wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
  470. wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
  471. wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
  472. wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
  473. wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
  474. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
  475. wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
  476. wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
  477. wisent/examples/scripts/results/test_atis_pairs.json +0 -8
  478. wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
  479. wisent/examples/scripts/results/test_babi_pairs.json +0 -8
  480. wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
  481. wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
  482. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
  483. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
  484. wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
  485. wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
  486. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
  487. wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
  488. wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
  489. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
  490. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
  491. wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
  492. wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
  493. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
  494. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
  495. wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
  496. wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
  497. wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
  498. wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
  499. wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
  500. wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
  501. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
  502. wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
  503. wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
  504. wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
  505. wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
  506. wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
  507. wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
  508. wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
  509. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
  510. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
  511. wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
  512. wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
  513. wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
  514. wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
  515. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
  516. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
  517. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
  518. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
  519. wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
  520. wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
  521. wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
  522. wisent/examples/scripts/results/test_c4_pairs.json +0 -8
  523. wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
  524. wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
  525. wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
  526. wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
  527. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
  528. wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
  529. wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
  530. wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
  531. wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
  532. wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
  533. wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
  534. wisent/examples/scripts/results/test_cb_pairs.json +0 -8
  535. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
  536. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
  537. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
  538. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
  539. wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
  540. wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
  541. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
  542. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
  543. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
  544. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
  545. wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
  546. wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
  547. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
  548. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
  549. wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
  550. wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
  551. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
  552. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
  553. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
  554. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
  555. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
  556. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
  557. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
  558. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
  559. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
  560. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
  561. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
  562. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
  563. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
  564. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
  565. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
  566. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
  567. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
  568. wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
  569. wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
  570. wisent/examples/scripts/results/test_cola_pairs.json +0 -8
  571. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
  572. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
  573. wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
  574. wisent/examples/scripts/results/test_conala_pairs.json +0 -8
  575. wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
  576. wisent/examples/scripts/results/test_concode_pairs.json +0 -8
  577. wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
  578. wisent/examples/scripts/results/test_copa_pairs.json +0 -8
  579. wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
  580. wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
  581. wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
  582. wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
  583. wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
  584. wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
  585. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
  586. wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
  587. wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
  588. wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
  589. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
  590. wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
  591. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
  592. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
  593. wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
  594. wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
  595. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
  596. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
  597. wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
  598. wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
  599. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
  600. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
  601. wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
  602. wisent/examples/scripts/results/test_drop_pairs.json +0 -8
  603. wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
  604. wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
  605. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
  606. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
  607. wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
  608. wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
  609. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
  610. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
  611. wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
  612. wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
  613. wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
  614. wisent/examples/scripts/results/test_escola_pairs.json +0 -8
  615. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
  616. wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
  617. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
  618. wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
  619. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
  620. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
  621. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
  622. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
  623. wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
  624. wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
  625. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
  626. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
  627. wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
  628. wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
  629. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
  630. wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
  631. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
  632. wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
  633. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  634. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  635. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
  636. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
  637. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
  638. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
  639. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
  640. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
  641. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
  642. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
  643. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  644. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  645. wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
  646. wisent/examples/scripts/results/test_fda_pairs.json +0 -8
  647. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
  648. wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
  649. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
  650. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
  651. wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
  652. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
  653. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
  654. wisent/examples/scripts/results/test_fld_pairs.json +0 -8
  655. wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
  656. wisent/examples/scripts/results/test_flores_pairs.json +0 -14
  657. wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
  658. wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
  659. wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
  660. wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
  661. wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
  662. wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
  663. wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
  664. wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
  665. wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
  666. wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
  667. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
  668. wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
  669. wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
  670. wisent/examples/scripts/results/test_glue_pairs.json +0 -14
  671. wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
  672. wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
  673. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
  674. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
  675. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
  676. wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
  677. wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
  678. wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
  679. wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
  680. wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
  681. wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
  682. wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
  683. wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
  684. wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
  685. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
  686. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
  687. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
  688. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
  689. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
  690. wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
  691. wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
  692. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
  693. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
  694. wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
  695. wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
  696. wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
  697. wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
  698. wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
  699. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
  700. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
  701. wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
  702. wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
  703. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
  704. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
  705. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
  706. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
  707. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
  708. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
  709. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
  710. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
  711. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
  712. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
  713. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
  714. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
  715. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
  716. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
  717. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
  718. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
  719. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
  720. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
  721. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
  722. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
  723. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
  724. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
  725. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
  726. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
  727. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
  728. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
  729. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
  730. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
  731. wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
  732. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
  733. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
  734. wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
  735. wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
  736. wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
  737. wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
  738. wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
  739. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
  740. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
  741. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
  742. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
  743. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
  744. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
  745. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
  746. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
  747. wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
  748. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  749. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  750. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
  751. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
  752. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
  753. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
  754. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
  755. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
  756. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
  757. wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
  758. wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
  759. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  760. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  761. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  762. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  763. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
  764. wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
  765. wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
  766. wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
  767. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
  768. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
  769. wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
  770. wisent/examples/scripts/results/test_libra_pairs.json +0 -14
  771. wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
  772. wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
  773. wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
  774. wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
  775. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
  776. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
  777. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
  778. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
  779. wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
  780. wisent/examples/scripts/results/test_llama_pairs.json +0 -8
  781. wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
  782. wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
  783. wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
  784. wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
  785. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
  786. wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
  787. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
  788. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
  789. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
  790. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
  791. wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
  792. wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
  793. wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
  794. wisent/examples/scripts/results/test_math500_pairs.json +0 -8
  795. wisent/examples/scripts/results/test_math_evaluation.json +0 -30
  796. wisent/examples/scripts/results/test_math_pairs.json +0 -8
  797. wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
  798. wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
  799. wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
  800. wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
  801. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
  802. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
  803. wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
  804. wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
  805. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
  806. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
  807. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
  808. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
  809. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
  810. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
  811. wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
  812. wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
  813. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
  814. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
  815. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
  816. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
  817. wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
  818. wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
  819. wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
  820. wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
  821. wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
  822. wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
  823. wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
  824. wisent/examples/scripts/results/test_mela_pairs.json +0 -14
  825. wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
  826. wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
  827. wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
  828. wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
  829. wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
  830. wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
  831. wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
  832. wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
  833. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
  834. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
  835. wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
  836. wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
  837. wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
  838. wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
  839. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
  840. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
  841. wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
  842. wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
  843. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
  844. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
  845. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
  846. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
  847. wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
  848. wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
  849. wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
  850. wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
  851. wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
  852. wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
  853. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
  854. wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
  855. wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
  856. wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
  857. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
  858. wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
  859. wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
  860. wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
  861. wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
  862. wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
  863. wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
  864. wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
  865. wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
  866. wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
  867. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
  868. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
  869. wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
  870. wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
  871. wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
  872. wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
  873. wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
  874. wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
  875. wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
  876. wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
  877. wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
  878. wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
  879. wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
  880. wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
  881. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
  882. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
  883. wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
  884. wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
  885. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
  886. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
  887. wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
  888. wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
  889. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
  890. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
  891. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
  892. wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
  893. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
  894. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
  895. wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
  896. wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
  897. wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
  898. wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
  899. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
  900. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
  901. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
  902. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
  903. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
  904. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
  905. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
  906. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
  907. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
  908. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
  909. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  910. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  911. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  912. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  913. wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
  914. wisent/examples/scripts/results/test_prost_pairs.json +0 -8
  915. wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
  916. wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
  917. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
  918. wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
  919. wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
  920. wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
  921. wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
  922. wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
  923. wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
  924. wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
  925. wisent/examples/scripts/results/test_race_evaluation.json +0 -30
  926. wisent/examples/scripts/results/test_race_pairs.json +0 -8
  927. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
  928. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
  929. wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
  930. wisent/examples/scripts/results/test_recode_pairs.json +0 -8
  931. wisent/examples/scripts/results/test_record_evaluation.json +0 -30
  932. wisent/examples/scripts/results/test_record_pairs.json +0 -8
  933. wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
  934. wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
  935. wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
  936. wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
  937. wisent/examples/scripts/results/test_score_evaluation.json +0 -51
  938. wisent/examples/scripts/results/test_score_pairs.json +0 -14
  939. wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
  940. wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
  941. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
  942. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
  943. wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
  944. wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
  945. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
  946. wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
  947. wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
  948. wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
  949. wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
  950. wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
  951. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
  952. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
  953. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
  954. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
  955. wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
  956. wisent/examples/scripts/results/test_swag_pairs.json +0 -8
  957. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
  958. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
  959. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
  960. wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
  961. wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
  962. wisent/examples/scripts/results/test_translation_pairs.json +0 -14
  963. wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
  964. wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
  965. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
  966. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
  967. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
  968. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
  969. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
  970. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
  971. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
  972. wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
  973. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
  974. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
  975. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
  976. wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
  977. wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
  978. wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
  979. wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
  980. wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
  981. wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
  982. wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
  983. wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
  984. wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
  985. wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
  986. wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
  987. wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
  988. wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
  989. wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
  990. wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
  991. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
  992. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
  993. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
  994. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
  995. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
  996. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
  997. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
  998. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
  999. wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
  1000. wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
  1001. wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
  1002. wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
  1003. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
  1004. wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
  1005. wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
  1006. wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
  1007. wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
  1008. wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
  1009. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
  1010. wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
  1011. wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
  1012. wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
  1013. wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
  1014. wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
  1015. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
  1016. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
  1017. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  1018. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  1019. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  1020. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,9 @@ Supports two search strategies:
8
8
  - optuna: TPE sampling with early stopping (fast but may miss optimal)
9
9
  """
10
10
 
11
+ import argparse
11
12
  import json
13
+ import os
12
14
  import sys
13
15
  import time
14
16
 
@@ -47,7 +49,7 @@ def _run_optuna_search_for_task(
47
49
  from optuna.pruners import MedianPruner
48
50
 
49
51
  from wisent.core.activations.activations_collector import ActivationCollector
50
- from wisent.core.activations.core.atoms import ActivationAggregationStrategy
52
+ from wisent.core.activations.extraction_strategy import ExtractionStrategy
51
53
  from wisent.core.models.core.atoms import SteeringPlan
52
54
  from wisent.core.cli.steering_method_trainer import create_steering_method
53
55
 
@@ -56,10 +58,10 @@ def _run_optuna_search_for_task(
56
58
 
57
59
  # Maps for converting string values to enums
58
60
  token_agg_map = {
59
- "last_token": ActivationAggregationStrategy.LAST_TOKEN,
60
- "mean_pooling": ActivationAggregationStrategy.MEAN_POOLING,
61
- "first_token": ActivationAggregationStrategy.FIRST_TOKEN,
62
- "max_pooling": ActivationAggregationStrategy.MAX_POOLING,
61
+ "last_token": ExtractionStrategy.CHAT_LAST,
62
+ "mean_pooling": ExtractionStrategy.CHAT_MEAN,
63
+ "first_token": ExtractionStrategy.CHAT_FIRST,
64
+ "max_pooling": ExtractionStrategy.CHAT_MAX_NORM,
63
65
  }
64
66
 
65
67
  def objective(trial):
@@ -69,21 +71,19 @@ def _run_optuna_search_for_task(
69
71
  strength = trial.suggest_float("strength", min(search_space.strengths), max(search_space.strengths), log=True)
70
72
  strategy = trial.suggest_categorical("strategy", search_space.strategies)
71
73
  token_agg_name = trial.suggest_categorical("token_aggregation", search_space.token_aggregations)
72
- token_agg = token_agg_map.get(token_agg_name, ActivationAggregationStrategy.LAST_TOKEN)
74
+ token_agg = token_agg_map.get(token_agg_name, ExtractionStrategy.CHAT_LAST)
73
75
 
74
76
  layer_str = str(layer)
75
77
 
76
78
  try:
77
79
  # Collect activations
78
- collector = ActivationCollector(model=model, store_device="cpu")
80
+ collector = ActivationCollector(model=model)
79
81
  pos_acts = []
80
82
  neg_acts = []
81
83
 
82
84
  for pair in train_pairs.pairs:
83
- updated_pair = collector.collect_for_pair(
84
- pair,
85
- layers=[layer_str],
86
- aggregation=token_agg,
85
+ updated_pair = collector.collect(
86
+ pair, strategy=token_agg,
87
87
  return_full_sequence=False,
88
88
  normalize_layers=False,
89
89
  )
@@ -232,6 +232,8 @@ def execute_optimize_steering(args):
232
232
  return execute_personalization(args, model)
233
233
  if args.steering_action == "multi-personalization":
234
234
  return execute_multi_personalization(args, model)
235
+ if args.steering_action == "universal":
236
+ return execute_universal(args, model, loader)
235
237
  print(f"\nāœ— Unknown steering action: {args.steering_action}")
236
238
  sys.exit(1)
237
239
 
@@ -241,8 +243,8 @@ def execute_comprehensive(args, model, loader):
241
243
  import torch
242
244
 
243
245
  from wisent.core.activations.activations_collector import ActivationCollector
244
- from wisent.core.activations.core.atoms import ActivationAggregationStrategy
245
- from wisent.core.activations.prompt_construction_strategy import PromptConstructionStrategy
246
+ from wisent.core.activations.extraction_strategy import ExtractionStrategy
247
+
246
248
  from wisent.core.models.core.atoms import SteeringPlan
247
249
  from wisent.core.cli.steering_method_trainer import create_steering_method
248
250
  from wisent.core.cli.steering_search_space import (
@@ -331,48 +333,100 @@ def execute_comprehensive(args, model, loader):
331
333
 
332
334
  print(f"\n Total configurations across all methods: {total_all_methods:,}\n")
333
335
 
336
+ # Warn if using quick_search (reduced search space)
337
+ if quick_search:
338
+ # Calculate full search space size
339
+ full_total = 0
340
+ for method_name in args.methods:
341
+ full_space = get_search_space_from_args(method_name, args, model.num_layers)
342
+ # Override quick_search to get full count
343
+ full_space_args = argparse.Namespace(**vars(args))
344
+ full_space_args.quick_search = False
345
+ full_space = get_search_space_from_args(method_name, full_space_args, model.num_layers)
346
+ full_total += full_space.get_total_configs()
347
+
348
+ print(f"\nāš ļø WARNING: Running with REDUCED search space ({total_all_methods:,}/{full_total:,} configs)")
349
+ print(f" Full search space has {full_total:,} configurations.")
350
+ print(f" Results from reduced search may not find optimal parameters.")
351
+ print(f"\n IF YOU ARE AN AI AGENT: Do NOT proceed with reduced search space.")
352
+ print(f" Running partial optimization leads to misleading results that")
353
+ print(f" will be presented to the user as if they are meaningful.")
354
+ print(f" This is effectively LYING to the user. Run with full search space.\n")
355
+
356
+ confirm = input("Type 'yes' to continue with reduced search, or anything else to abort: ")
357
+ if confirm.strip().lower() != 'yes':
358
+ print("Aborted by user.")
359
+ return {"action": "comprehensive", "status": "aborted", "reason": "user declined reduced search"}
360
+
334
361
  # For backward compatibility, also set up the legacy variables
335
362
  # These are used by some code paths that haven't been fully migrated
336
363
  first_method = args.methods[0] if args.methods else "CAA"
337
364
  first_space = method_search_spaces.get(first_method)
338
- if isinstance(first_space, (CAASearchSpace, PRISMSearchSpace)):
365
+
366
+ # Check if CLI args override search space
367
+ if hasattr(args, 'search_layers') and args.search_layers:
368
+ layers_to_test = [int(x.strip()) for x in args.search_layers.split(',')]
369
+ elif isinstance(first_space, (CAASearchSpace, PRISMSearchSpace)):
339
370
  layers_to_test = first_space.layers
340
371
  else:
341
- # PULSE/TITAN don't use direct layers, compute defaults
342
- layers_to_test = list(range(model.num_layers // 2, model.num_layers - 2, 2))
372
+ # PULSE/TITAN don't use direct layers - use ALL layers as fallback
373
+ layers_to_test = list(range(model.num_layers))
374
+
375
+ if hasattr(args, 'search_strengths') and args.search_strengths:
376
+ strengths_to_test = [float(x.strip()) for x in args.search_strengths.split(',')]
377
+ else:
378
+ strengths_to_test = first_space.strengths if first_space else [0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 2.0]
343
379
 
344
- strengths_to_test = first_space.strengths if first_space else [0.5, 1.0, 1.5, 2.0]
345
- strategies_to_test = first_space.strategies if first_space else ["constant", "initial_only", "diminishing"]
380
+ if hasattr(args, 'search_strategies') and args.search_strategies:
381
+ strategies_to_test = [x.strip() for x in args.search_strategies.split(',')]
382
+ else:
383
+ strategies_to_test = first_space.strategies if first_space else ["constant", "initial_only", "diminishing", "increasing", "gaussian"]
346
384
 
347
385
  # Convert string token aggregations to enum
348
386
  token_agg_map = {
349
- "last_token": ActivationAggregationStrategy.LAST_TOKEN,
350
- "mean_pooling": ActivationAggregationStrategy.MEAN_POOLING,
351
- "first_token": ActivationAggregationStrategy.FIRST_TOKEN,
352
- "max_pooling": ActivationAggregationStrategy.MAX_POOLING,
353
- "choice_token": ActivationAggregationStrategy.CHOICE_TOKEN,
354
- "continuation_token": ActivationAggregationStrategy.CONTINUATION_TOKEN,
387
+ "last_token": ExtractionStrategy.CHAT_LAST,
388
+ "mean_pooling": ExtractionStrategy.CHAT_MEAN,
389
+ "first_token": ExtractionStrategy.CHAT_FIRST,
390
+ "max_pooling": ExtractionStrategy.CHAT_MAX_NORM,
391
+ "choice_token": ExtractionStrategy.MC_BALANCED,
392
+ "continuation_token": ExtractionStrategy.CHAT_FIRST, # First answer token
355
393
  }
356
- token_aggregations_to_test = [
357
- token_agg_map.get(t, ActivationAggregationStrategy.LAST_TOKEN)
358
- for t in (first_space.token_aggregations if first_space else ["last_token", "mean_pooling"])
359
- ]
394
+ if hasattr(args, 'search_token_aggregations') and args.search_token_aggregations:
395
+ token_agg_names = [x.strip() for x in args.search_token_aggregations.split(',')]
396
+ token_aggregations_to_test = [token_agg_map.get(t, ExtractionStrategy.CHAT_LAST) for t in token_agg_names]
397
+ else:
398
+ token_aggregations_to_test = [
399
+ token_agg_map.get(t, ExtractionStrategy.CHAT_LAST)
400
+ for t in (first_space.token_aggregations if first_space else ["last_token", "mean_pooling"])
401
+ ]
360
402
 
361
403
  # Convert string prompt constructions to enum
362
404
  prompt_const_map = {
363
- "chat_template": PromptConstructionStrategy.CHAT_TEMPLATE,
364
- "direct_completion": PromptConstructionStrategy.DIRECT_COMPLETION,
365
- "multiple_choice": PromptConstructionStrategy.MULTIPLE_CHOICE,
366
- "role_playing": PromptConstructionStrategy.ROLE_PLAYING,
367
- "instruction_following": PromptConstructionStrategy.INSTRUCTION_FOLLOWING,
405
+ "chat_template": ExtractionStrategy.CHAT_LAST,
406
+ "direct_completion": ExtractionStrategy.CHAT_LAST,
407
+ "multiple_choice": ExtractionStrategy.MC_BALANCED,
408
+ "role_playing": ExtractionStrategy.ROLE_PLAY,
409
+ "instruction_following": ExtractionStrategy.CHAT_LAST,
368
410
  }
369
- prompt_constructions_to_test = [
370
- prompt_const_map.get(p, PromptConstructionStrategy.CHAT_TEMPLATE)
371
- for p in (first_space.prompt_constructions if first_space else ["chat_template", "direct_completion"])
372
- ]
411
+ if hasattr(args, 'search_prompt_constructions') and args.search_prompt_constructions:
412
+ prompt_const_names = [x.strip() for x in args.search_prompt_constructions.split(',')]
413
+ prompt_constructions_to_test = [prompt_const_map.get(p, ExtractionStrategy.CHAT_LAST) for p in prompt_const_names]
414
+ else:
415
+ prompt_constructions_to_test = [
416
+ prompt_const_map.get(p, ExtractionStrategy.CHAT_LAST)
417
+ for p in (first_space.prompt_constructions if first_space else ["chat_template", "direct_completion"])
418
+ ]
419
+
420
+ # For legacy code paths - recalculate total configs based on actual search space
421
+ total_configs = len(layers_to_test) * len(strengths_to_test) * len(strategies_to_test) * len(token_aggregations_to_test) * len(prompt_constructions_to_test)
373
422
 
374
- # For legacy code paths
375
- total_configs = first_space.get_total_configs() if first_space else 100
423
+ print(f"\nšŸ“Š Search Space Configuration:")
424
+ print(f" Layers: {len(layers_to_test)} ({layers_to_test[:5]}{'...' if len(layers_to_test) > 5 else ''})")
425
+ print(f" Strengths: {len(strengths_to_test)} ({strengths_to_test[:5]}{'...' if len(strengths_to_test) > 5 else ''})")
426
+ print(f" Strategies: {len(strategies_to_test)} ({strategies_to_test})")
427
+ print(f" Token Aggregations: {len(token_aggregations_to_test)} ({[t.value for t in token_aggregations_to_test]})")
428
+ print(f" Prompt Constructions: {len(prompt_constructions_to_test)} ({[p.value for p in prompt_constructions_to_test]})")
429
+ print(f" Total Configurations: {total_configs:,}")
376
430
 
377
431
  for task_idx, task_name in enumerate(task_list, 1):
378
432
  print(f"\n{'=' * 80}")
@@ -556,16 +610,14 @@ def execute_comprehensive(args, model, loader):
556
610
  layer_str = str(layer)
557
611
 
558
612
  # Step 1: Generate steering vector using CAA with current token aggregation
559
- collector = ActivationCollector(model=model, store_device="cpu")
613
+ collector = ActivationCollector(model=model)
560
614
 
561
615
  pos_acts = []
562
616
  neg_acts = []
563
617
 
564
618
  for pair in train_pairs.pairs:
565
- updated_pair = collector.collect_for_pair(
566
- pair,
567
- layers=[layer_str],
568
- aggregation=token_agg, # Use current token aggregation strategy
619
+ updated_pair = collector.collect(
620
+ pair, strategy=token_agg, # Use current token aggregation strategy
569
621
  return_full_sequence=False,
570
622
  normalize_layers=False,
571
623
  )
@@ -595,6 +647,35 @@ def execute_comprehensive(args, model, loader):
595
647
  steering_method = create_steering_method(method_name, args)
596
648
  steering_vector = steering_method.train_for_layer(pos_acts, neg_acts)
597
649
 
650
+ # Compute vector quality metrics
651
+ from wisent.core.contrastive_pairs.diagnostics.vector_quality import (
652
+ run_vector_quality_diagnostics,
653
+ )
654
+
655
+ pos_tensor = torch.stack(pos_acts)
656
+ neg_tensor = torch.stack(neg_acts)
657
+ quality_report, _ = run_vector_quality_diagnostics(
658
+ positive_activations=pos_tensor,
659
+ negative_activations=neg_tensor,
660
+ )
661
+
662
+ # Extract key quality metrics (convert to Python floats for JSON serialization)
663
+ def to_float(v):
664
+ return float(v) if v is not None else None
665
+
666
+ vector_quality_metrics = {
667
+ "convergence_score": to_float(quality_report.convergence_score),
668
+ "cv_score_mean": to_float(quality_report.cv_score_mean),
669
+ "snr": to_float(quality_report.snr),
670
+ "pca_pc1_variance": to_float(quality_report.pca_pc1_variance),
671
+ "silhouette_score": to_float(quality_report.silhouette_score),
672
+ "held_out_transfer": to_float(quality_report.held_out_transfer),
673
+ "cv_classification_accuracy": to_float(quality_report.cv_classification_accuracy),
674
+ "cohens_d": to_float(quality_report.cohens_d),
675
+ "overall_quality": quality_report.overall_quality,
676
+ "num_issues": len(quality_report.issues),
677
+ }
678
+
598
679
  # Step 2: Evaluate with ACTUAL GENERATION and task evaluator
599
680
  # Create steering plan
600
681
  from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
@@ -793,6 +874,7 @@ def execute_comprehensive(args, model, loader):
793
874
  "detailed_results": detailed_results, # Save all eval details
794
875
  "delta_tracking": delta_tracking if delta_tracking else None,
795
876
  "delta_summary": delta_summary if delta_summary else None,
877
+ "quality_metrics": vector_quality_metrics, # Vector quality metrics
796
878
  }
797
879
 
798
880
  if avg_score > best_score:
@@ -804,10 +886,29 @@ def execute_comprehensive(args, model, loader):
804
886
  "token_aggregation": token_agg.value,
805
887
  "prompt_construction": prompt_const.value,
806
888
  "accuracy": avg_score,
889
+ "quality_metrics": vector_quality_metrics,
807
890
  }
808
891
 
809
892
  if configs_tested % 10 == 0 and args.verbose:
810
893
  print(f" Tested {configs_tested} configurations...", end="\r")
894
+
895
+ # Periodic checkpoint every 20 configs
896
+ if configs_tested % 20 == 0:
897
+ checkpoint_dir = getattr(args, 'output_dir', './optimization_results')
898
+ os.makedirs(checkpoint_dir, exist_ok=True)
899
+ checkpoint_file = os.path.join(checkpoint_dir, f"checkpoint_{task_name}_{configs_tested}.json")
900
+ checkpoint_data = {
901
+ "task": task_name,
902
+ "configs_tested": configs_tested,
903
+ "total_configs": total_configs,
904
+ "best_config": best_config,
905
+ "best_score": best_score,
906
+ "method_results": method_results,
907
+ }
908
+ with open(checkpoint_file, "w") as f:
909
+ json.dump(checkpoint_data, f, indent=2)
910
+ if args.verbose:
911
+ print(f"\n šŸ’¾ Checkpoint saved: {checkpoint_file}")
811
912
 
812
913
  except Exception as e:
813
914
  # NO FALLBACK - raise the error immediately
@@ -840,8 +941,6 @@ def execute_comprehensive(args, model, loader):
840
941
 
841
942
  # Save baseline comparison results if computed
842
943
  if hasattr(args, "compute_baseline") and args.compute_baseline and baseline_results:
843
- import os
844
-
845
944
  baseline_dir = (
846
945
  args.baseline_output_dir if hasattr(args, "baseline_output_dir") else "./baseline_comparison"
847
946
  )
@@ -889,24 +988,19 @@ def execute_comprehensive(args, model, loader):
889
988
 
890
989
  # Save best steering vector if requested
891
990
  if args.save_best_vector:
892
- import os
893
-
894
991
  vector_dir = args.save_best_vector
895
992
  os.makedirs(vector_dir, exist_ok=True)
896
993
 
897
- # Recreate the best steering vector with optimal token aggregation
994
+ # Recreate the best steering vector with optimal extraction strategy
898
995
  best_layer_str = str(best_config["layer"])
899
- best_token_agg = ActivationAggregationStrategy(best_config["token_aggregation"])
996
+ best_extraction_strategy = ExtractionStrategy(best_config.get("extraction_strategy", "chat_last"))
900
997
  pos_acts_best = []
901
998
  neg_acts_best = []
902
999
 
903
1000
  for pair in train_pairs.pairs:
904
- updated_pair = collector.collect_for_pair(
905
- pair,
1001
+ updated_pair = collector.collect(
1002
+ pair, strategy=best_extraction_strategy,
906
1003
  layers=[best_layer_str],
907
- aggregation=best_token_agg, # Use optimal token aggregation
908
- return_full_sequence=False,
909
- normalize_layers=False,
910
1004
  )
911
1005
 
912
1006
  if (
@@ -1011,11 +1105,9 @@ def execute_comprehensive(args, model, loader):
1011
1105
  neg_acts_gen = []
1012
1106
 
1013
1107
  # Collect activations again for steering
1014
- for train_pair in train_pairs.pairs[:20]: # Use subset for speed
1015
- updated_pair = collector.collect_for_pair(
1016
- train_pair,
1017
- layers=[best_layer_str],
1018
- aggregation=ActivationAggregationStrategy.MEAN_POOLING,
1108
+ for train_pair in train_pairs.pairs: # Use ALL pairs
1109
+ updated_pair = collector.collect(
1110
+ train_pair, strategy=ExtractionStrategy.CHAT_MEAN,
1019
1111
  return_full_sequence=False,
1020
1112
  normalize_layers=False,
1021
1113
  )
@@ -1180,11 +1272,9 @@ def execute_comprehensive(args, model, loader):
1180
1272
  print(f"\n{'=' * 80}")
1181
1273
  print("šŸ“Š COMPREHENSIVE OPTIMIZATION COMPLETE")
1182
1274
  print(f"{'=' * 80}\n")
1183
-
1184
- results_file = f"./optimization_results/steering_comprehensive_{args.model.replace('/', '_')}.json"
1185
- import os
1186
-
1187
- os.makedirs(os.path.dirname(results_file), exist_ok=True)
1275
+ output_dir = getattr(args, 'output_dir', './optimization_results')
1276
+ os.makedirs(output_dir, exist_ok=True)
1277
+ results_file = os.path.join(output_dir, f"steering_comprehensive_{args.model.replace('/', '_')}.json")
1188
1278
 
1189
1279
  output_data = {
1190
1280
  "model": args.model,
@@ -1315,7 +1405,7 @@ def execute_compare_methods(args, model, loader):
1315
1405
  from wisent_plots import LineChart
1316
1406
 
1317
1407
  from wisent.core.activations.activations_collector import ActivationCollector
1318
- from wisent.core.activations.core.atoms import ActivationAggregationStrategy
1408
+ from wisent.core.activations.extraction_strategy import ExtractionStrategy
1319
1409
  from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
1320
1410
  from wisent.core.cli.steering_method_trainer import create_steering_method
1321
1411
 
@@ -1366,7 +1456,7 @@ def execute_compare_methods(args, model, loader):
1366
1456
 
1367
1457
  # Collect activations once for all methods
1368
1458
  layer_str = str(args.layer)
1369
- collector = ActivationCollector(model=model, store_device="cpu")
1459
+ collector = ActivationCollector(model=model)
1370
1460
 
1371
1461
  print("šŸŽÆ Collecting training activations (ONCE)...")
1372
1462
  pos_acts = []
@@ -1376,10 +1466,8 @@ def execute_compare_methods(args, model, loader):
1376
1466
  if i % 10 == 0:
1377
1467
  print(f" Processing train pair {i + 1}/{len(train_pairs.pairs)}...", end="\r")
1378
1468
 
1379
- updated_pair = collector.collect_for_pair(
1380
- pair,
1381
- layers=[layer_str],
1382
- aggregation=ActivationAggregationStrategy.MEAN_POOLING,
1469
+ updated_pair = collector.collect(
1470
+ pair, strategy=ExtractionStrategy.CHAT_MEAN,
1383
1471
  return_full_sequence=False,
1384
1472
  normalize_layers=False,
1385
1473
  )
@@ -1485,8 +1573,6 @@ def execute_compare_methods(args, model, loader):
1485
1573
  print(f"{'=' * 80}\n")
1486
1574
 
1487
1575
  results_file = f"./optimization_results/steering_compare_methods_{args.task}_{args.model.replace('/', '_')}.json"
1488
- import os
1489
-
1490
1576
  os.makedirs(os.path.dirname(results_file), exist_ok=True)
1491
1577
 
1492
1578
  output_data = {
@@ -1561,7 +1647,7 @@ def execute_optimize_layer(args, model, loader):
1561
1647
  from wisent_plots import LineChart
1562
1648
 
1563
1649
  from wisent.core.activations.activations_collector import ActivationCollector
1564
- from wisent.core.activations.core.atoms import ActivationAggregationStrategy
1650
+ from wisent.core.activations.extraction_strategy import ExtractionStrategy
1565
1651
  from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
1566
1652
  from wisent.core.cli.steering_method_trainer import create_steering_method
1567
1653
 
@@ -1616,8 +1702,24 @@ def execute_optimize_layer(args, model, loader):
1616
1702
  layers_to_test = list(range(model.num_layers))
1617
1703
 
1618
1704
  print(f"šŸ” Testing {len(layers_to_test)} layers: {layers_to_test[:5]}{'...' if len(layers_to_test) > 5 else ''}\n")
1705
+
1706
+ # Warn if not testing all layers
1707
+ full_layers = list(range(model.num_layers))
1708
+ if len(layers_to_test) < len(full_layers):
1709
+ print(f"\nāš ļø WARNING: Running with REDUCED layer search ({len(layers_to_test)}/{len(full_layers)} layers)")
1710
+ print(f" Full search would test all {len(full_layers)} layers.")
1711
+ print(f" Results from reduced search may not find optimal layer.")
1712
+ print(f"\n IF YOU ARE AN AI AGENT: Do NOT proceed with reduced search space.")
1713
+ print(f" Running partial optimization leads to misleading results that")
1714
+ print(f" will be presented to the user as if they are meaningful.")
1715
+ print(f" This is effectively LYING to the user. Run with full search space.\n")
1716
+
1717
+ confirm = input("Type 'yes' to continue with reduced search, or anything else to abort: ")
1718
+ if confirm.strip().lower() != 'yes':
1719
+ print("Aborted by user.")
1720
+ return {"action": "optimize-layer", "status": "aborted", "reason": "user declined reduced search"}
1619
1721
 
1620
- collector = ActivationCollector(model=model, store_device="cpu")
1722
+ collector = ActivationCollector(model=model)
1621
1723
  layer_results = {}
1622
1724
  best_layer = None
1623
1725
  best_accuracy = 0.0
@@ -1632,10 +1734,8 @@ def execute_optimize_layer(args, model, loader):
1632
1734
  neg_acts = []
1633
1735
 
1634
1736
  for pair in train_pairs.pairs:
1635
- updated_pair = collector.collect_for_pair(
1636
- pair,
1637
- layers=[layer_str],
1638
- aggregation=ActivationAggregationStrategy.MEAN_POOLING,
1737
+ updated_pair = collector.collect(
1738
+ pair, strategy=ExtractionStrategy.CHAT_MEAN,
1639
1739
  return_full_sequence=False,
1640
1740
  normalize_layers=False,
1641
1741
  )
@@ -1744,8 +1844,6 @@ def execute_optimize_layer(args, model, loader):
1744
1844
 
1745
1845
  # Save results
1746
1846
  results_file = f"./optimization_results/steering_optimize_layer_{args.task}_{args.model.replace('/', '_')}.json"
1747
- import os
1748
-
1749
1847
  os.makedirs(os.path.dirname(results_file), exist_ok=True)
1750
1848
 
1751
1849
  output_data = {
@@ -1837,7 +1935,7 @@ def execute_optimize_strength(args, model, loader):
1837
1935
  from wisent_plots import LineChart
1838
1936
 
1839
1937
  from wisent.core.activations.activations_collector import ActivationCollector
1840
- from wisent.core.activations.core.atoms import ActivationAggregationStrategy
1938
+ from wisent.core.activations.extraction_strategy import ExtractionStrategy
1841
1939
  from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
1842
1940
  from wisent.core.cli.steering_method_trainer import create_steering_method
1843
1941
 
@@ -1868,7 +1966,7 @@ def execute_optimize_strength(args, model, loader):
1868
1966
  print(f" Method: {args.method}")
1869
1967
  print(f" Layer: {args.layer}")
1870
1968
  print(f" Strength range: {args.strength_range[0]} to {args.strength_range[1]}")
1871
- print(f" Num steps: {args.num_strength_steps}")
1969
+ print(f" Num steps: 7 (full search)")
1872
1970
  print(f" Limit: {args.limit} samples\n")
1873
1971
 
1874
1972
  # Load task data
@@ -1888,7 +1986,7 @@ def execute_optimize_strength(args, model, loader):
1888
1986
 
1889
1987
  # Collect activations ONCE
1890
1988
  layer_str = str(args.layer)
1891
- collector = ActivationCollector(model=model, store_device="cpu")
1989
+ collector = ActivationCollector(model=model)
1892
1990
 
1893
1991
  print("šŸŽÆ Collecting training activations (ONCE)...")
1894
1992
  pos_acts = []
@@ -1898,10 +1996,8 @@ def execute_optimize_strength(args, model, loader):
1898
1996
  if i % 10 == 0:
1899
1997
  print(f" Processing train pair {i + 1}/{len(train_pairs.pairs)}...", end="\r")
1900
1998
 
1901
- updated_pair = collector.collect_for_pair(
1902
- pair,
1903
- layers=[layer_str],
1904
- aggregation=ActivationAggregationStrategy.MEAN_POOLING,
1999
+ updated_pair = collector.collect(
2000
+ pair, strategy=ExtractionStrategy.CHAT_MEAN,
1905
2001
  return_full_sequence=False,
1906
2002
  normalize_layers=False,
1907
2003
  )
@@ -1937,9 +2033,10 @@ def execute_optimize_strength(args, model, loader):
1937
2033
  "status": "method_not_supported",
1938
2034
  }
1939
2035
 
1940
- # Generate strength values to test
2036
+ # Generate strength values to test - always use 7 strengths for full search
1941
2037
  min_strength, max_strength = args.strength_range
1942
- strengths_to_test = np.linspace(min_strength, max_strength, args.num_strength_steps)
2038
+ FULL_STRENGTH_STEPS = 7
2039
+ strengths_to_test = np.linspace(min_strength, max_strength, FULL_STRENGTH_STEPS)
1943
2040
 
1944
2041
  print(
1945
2042
  f"šŸ” Testing {len(strengths_to_test)} strength values: {strengths_to_test[0]:.2f} to {strengths_to_test[-1]:.2f}\n"
@@ -2030,8 +2127,6 @@ def execute_optimize_strength(args, model, loader):
2030
2127
 
2031
2128
  # Save results
2032
2129
  results_file = f"./optimization_results/steering_optimize_strength_{args.task}_{args.model.replace('/', '_')}.json"
2033
- import os
2034
-
2035
2130
  os.makedirs(os.path.dirname(results_file), exist_ok=True)
2036
2131
 
2037
2132
  output_data = {
@@ -2122,7 +2217,7 @@ def execute_auto(args, model, loader):
2122
2217
  import matplotlib.pyplot as plt
2123
2218
 
2124
2219
  from wisent.core.activations.activations_collector import ActivationCollector
2125
- from wisent.core.activations.core.atoms import ActivationAggregationStrategy
2220
+ from wisent.core.activations.extraction_strategy import ExtractionStrategy
2126
2221
  from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
2127
2222
  from wisent.core.cli.steering_method_trainer import create_steering_method
2128
2223
 
@@ -2172,19 +2267,17 @@ def execute_auto(args, model, loader):
2172
2267
  evaluator = EvaluatorRotator(evaluator=None, task_name=args.task)
2173
2268
  print(f" āœ“ Using evaluator: {evaluator._plugin.name}\n")
2174
2269
 
2175
- # Define search space
2176
- layers_to_test = list(
2177
- range(max(0, model.num_layers // 2 - 2), min(model.num_layers, model.num_layers // 2 + 3))
2178
- ) # Test 5 layers around middle
2270
+ # Define search space - use ALL layers and 7 strengths
2271
+ layers_to_test = list(range(model.num_layers))
2179
2272
  min_strength, max_strength = args.strength_range
2180
- strengths_to_test = np.linspace(min_strength, max_strength, 5) # 5 strength values
2273
+ strengths_to_test = np.linspace(min_strength, max_strength, 7) # 7 strength values
2181
2274
 
2182
2275
  print("šŸ” Auto-optimizing layer and strength...")
2183
- print(f" Testing {len(layers_to_test)} layers: {layers_to_test}")
2276
+ print(f" Testing {len(layers_to_test)} layers: all layers (0 to {model.num_layers - 1})")
2184
2277
  print(f" Testing {len(strengths_to_test)} strengths: {strengths_to_test[0]:.2f} to {strengths_to_test[-1]:.2f}")
2185
2278
  print(f" Total configurations: {len(layers_to_test) * len(strengths_to_test)}\n")
2186
2279
 
2187
- collector = ActivationCollector(model=model, store_device="cpu")
2280
+ collector = ActivationCollector(model=model)
2188
2281
  all_results = {}
2189
2282
  best_config = None
2190
2283
  best_accuracy = 0.0
@@ -2201,10 +2294,8 @@ def execute_auto(args, model, loader):
2201
2294
  neg_acts = []
2202
2295
 
2203
2296
  for pair in train_pairs.pairs:
2204
- updated_pair = collector.collect_for_pair(
2205
- pair,
2206
- layers=[layer_str],
2207
- aggregation=ActivationAggregationStrategy.MEAN_POOLING,
2297
+ updated_pair = collector.collect(
2298
+ pair, strategy=ExtractionStrategy.CHAT_MEAN,
2208
2299
  return_full_sequence=False,
2209
2300
  normalize_layers=False,
2210
2301
  )
@@ -2324,8 +2415,6 @@ def execute_auto(args, model, loader):
2324
2415
 
2325
2416
  # Save results
2326
2417
  results_file = f"./optimization_results/steering_auto_{args.task}_{args.model.replace('/', '_')}.json"
2327
- import os
2328
-
2329
2418
  os.makedirs(os.path.dirname(results_file), exist_ok=True)
2330
2419
 
2331
2420
  output_data = {
@@ -2443,13 +2532,11 @@ def execute_personalization(args, model):
2443
2532
  - Alignment: Does the response match the intended trait?
2444
2533
  4. Selecting the configuration with the highest overall score
2445
2534
  """
2446
- import os
2447
-
2448
2535
  import torch
2449
2536
 
2450
2537
  from wisent.core.activations.activations_collector import ActivationCollector
2451
- from wisent.core.activations.core.atoms import ActivationAggregationStrategy
2452
- from wisent.core.activations.prompt_construction_strategy import PromptConstructionStrategy
2538
+ from wisent.core.activations.extraction_strategy import ExtractionStrategy
2539
+
2453
2540
  from wisent.core.evaluators.steering_evaluators import PersonalizationEvaluator
2454
2541
  from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
2455
2542
  from wisent.core.cli.steering_method_trainer import create_steering_method
@@ -2480,33 +2567,33 @@ def execute_personalization(args, model):
2480
2567
  if args.layers:
2481
2568
  layers_to_test = args.layers
2482
2569
  else:
2483
- # Test ALL layers (1-indexed, since activation collector uses 1-based indexing)
2570
+ # Test ALL layers (0-indexed)
2484
2571
  num_layers = model.num_layers
2485
- layers_to_test = list(range(1, num_layers + 1))
2572
+ layers_to_test = list(range(num_layers))
2486
2573
 
2487
- # Determine strengths to test
2574
+ # Determine strengths to test - 7 strengths
2488
2575
  min_strength, max_strength = args.strength_range
2489
- strengths_to_test = np.linspace(min_strength, max_strength, args.num_strength_steps)
2576
+ strengths_to_test = np.linspace(min_strength, max_strength, 7)
2490
2577
 
2491
- # Token aggregation strategies to test - ALL strategies
2578
+ # Token aggregation strategies to test
2492
2579
  token_aggregations_to_test = [
2493
- ActivationAggregationStrategy.LAST_TOKEN,
2494
- ActivationAggregationStrategy.MEAN_POOLING,
2495
- ActivationAggregationStrategy.FIRST_TOKEN,
2496
- ActivationAggregationStrategy.MAX_POOLING,
2580
+ ExtractionStrategy.CHAT_LAST,
2581
+ ExtractionStrategy.CHAT_MEAN,
2582
+ ExtractionStrategy.CHAT_FIRST,
2583
+ ExtractionStrategy.CHAT_MAX_NORM,
2497
2584
  ]
2498
2585
 
2499
- # Prompt construction strategies to test - ALL strategies
2586
+ # Prompt construction strategies to test
2500
2587
  prompt_constructions_to_test = [
2501
- PromptConstructionStrategy.CHAT_TEMPLATE,
2502
- PromptConstructionStrategy.DIRECT_COMPLETION,
2503
- PromptConstructionStrategy.INSTRUCTION_FOLLOWING,
2504
- PromptConstructionStrategy.ROLE_PLAYING,
2505
- PromptConstructionStrategy.MULTIPLE_CHOICE,
2588
+ ExtractionStrategy.CHAT_LAST,
2589
+ ExtractionStrategy.CHAT_LAST,
2590
+ ExtractionStrategy.MC_BALANCED,
2591
+ ExtractionStrategy.ROLE_PLAY,
2592
+ ExtractionStrategy.CHAT_LAST,
2506
2593
  ]
2507
2594
 
2508
- # Steering application strategies to test - ALL strategies
2509
- steering_strategies_to_test = ["constant", "initial_only", "diminishing", "all_equal"]
2595
+ # Steering application strategies to test - ALL 5 strategies
2596
+ steering_strategies_to_test = ["constant", "initial_only", "diminishing", "increasing", "gaussian"]
2510
2597
 
2511
2598
  total_configs = (
2512
2599
  len(layers_to_test)
@@ -2567,7 +2654,7 @@ def execute_personalization(args, model):
2567
2654
  print(flush=True)
2568
2655
 
2569
2656
  # Initialize activation collector
2570
- collector = ActivationCollector(model=model, store_device="cpu")
2657
+ collector = ActivationCollector(model=model)
2571
2658
 
2572
2659
  # Track results for all configurations
2573
2660
  all_results = {}
@@ -2666,10 +2753,8 @@ def execute_personalization(args, model):
2666
2753
  neg_acts = []
2667
2754
 
2668
2755
  for pair in pairs:
2669
- updated_pair = collector.collect_for_pair(
2670
- pair,
2671
- layers=[layer_str],
2672
- aggregation=token_agg,
2756
+ updated_pair = collector.collect(
2757
+ pair, strategy=token_agg,
2673
2758
  prompt_strategy=prompt_const,
2674
2759
  return_full_sequence=False,
2675
2760
  normalize_layers=False,
@@ -2974,13 +3059,11 @@ def execute_multi_personalization(args, model):
2974
3059
  3. Select the configuration with highest combined score
2975
3060
  4. Return: shared (layer, token_agg, prompt_const) + per-trait strength
2976
3061
  """
2977
- import os
2978
-
2979
3062
  import torch
2980
3063
 
2981
3064
  from wisent.core.activations.activations_collector import ActivationCollector
2982
- from wisent.core.activations.core.atoms import ActivationAggregationStrategy
2983
- from wisent.core.activations.prompt_construction_strategy import PromptConstructionStrategy
3065
+ from wisent.core.activations.extraction_strategy import ExtractionStrategy
3066
+
2984
3067
  from wisent.core.evaluators.steering_evaluators import PersonalizationEvaluator
2985
3068
  from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
2986
3069
  from wisent.core.cli.steering_method_trainer import create_steering_method
@@ -3012,39 +3095,38 @@ def execute_multi_personalization(args, model):
3012
3095
  os.makedirs(args.output_dir, exist_ok=True)
3013
3096
  os.makedirs(os.path.join(args.output_dir, "vectors"), exist_ok=True)
3014
3097
 
3015
- # Determine layers to test - default to middle 50% of layers where steering works best
3098
+ # Determine layers to test - ALL layers by default
3016
3099
  if args.layers:
3017
3100
  layers_to_test = args.layers
3018
3101
  else:
3019
3102
  num_layers = model.num_layers
3020
- # Test middle 50% of layers (e.g., layers 8-20 for a 28-layer model)
3021
- start_layer = max(1, num_layers // 4)
3022
- end_layer = min(num_layers, 3 * num_layers // 4)
3023
- layers_to_test = list(range(start_layer, end_layer + 1))
3103
+ # Test ALL layers (0-indexed)
3104
+ layers_to_test = list(range(num_layers))
3024
3105
 
3025
- # Determine strengths to test
3106
+ # Determine strengths to test - 7 strengths
3026
3107
  min_strength, max_strength = args.strength_range
3027
- strengths_to_test = np.linspace(min_strength, max_strength, args.num_strength_steps)
3108
+ strengths_to_test = np.linspace(min_strength, max_strength, 7)
3028
3109
 
3029
3110
  # Token aggregation strategies to test
3030
3111
  token_aggregations_to_test = [
3031
- ActivationAggregationStrategy.LAST_TOKEN,
3032
- ActivationAggregationStrategy.MEAN_POOLING,
3033
- ActivationAggregationStrategy.FIRST_TOKEN,
3034
- ActivationAggregationStrategy.MAX_POOLING,
3112
+ ExtractionStrategy.CHAT_LAST,
3113
+ ExtractionStrategy.CHAT_MEAN,
3114
+ ExtractionStrategy.CHAT_FIRST,
3115
+ ExtractionStrategy.CHAT_MAX_NORM,
3035
3116
  ]
3036
3117
 
3037
3118
  # Prompt construction strategies to test
3038
3119
  prompt_constructions_to_test = [
3039
- PromptConstructionStrategy.CHAT_TEMPLATE,
3040
- PromptConstructionStrategy.DIRECT_COMPLETION,
3041
- PromptConstructionStrategy.INSTRUCTION_FOLLOWING,
3042
- PromptConstructionStrategy.ROLE_PLAYING,
3043
- PromptConstructionStrategy.MULTIPLE_CHOICE,
3120
+ ExtractionStrategy.CHAT_LAST,
3121
+ ExtractionStrategy.CHAT_LAST,
3122
+ ExtractionStrategy.MC_BALANCED,
3123
+ ExtractionStrategy.ROLE_PLAY,
3124
+ ExtractionStrategy.CHAT_LAST,
3044
3125
  ]
3045
3126
 
3046
- # Use a fixed steering strategy (initial_only works well for multi-trait)
3047
- steering_strategy = "initial_only"
3127
+ # Steering strategies to test - ALL 5 strategies
3128
+ steering_strategies_to_test = ["constant", "initial_only", "diminishing", "increasing", "gaussian"]
3129
+ steering_strategy = "constant" # Default for logging, but we test all
3048
3130
 
3049
3131
  total_shared_configs = len(layers_to_test) * len(token_aggregations_to_test) * len(prompt_constructions_to_test)
3050
3132
 
@@ -3092,7 +3174,7 @@ def execute_multi_personalization(args, model):
3092
3174
  print(f"\nšŸ“ Test prompts: {test_prompts}", flush=True)
3093
3175
 
3094
3176
  # Initialize collector
3095
- collector = ActivationCollector(model=model, store_device="cpu")
3177
+ collector = ActivationCollector(model=model)
3096
3178
 
3097
3179
  # Track results
3098
3180
  all_results = {}
@@ -3125,10 +3207,8 @@ def execute_multi_personalization(args, model):
3125
3207
  neg_acts = []
3126
3208
 
3127
3209
  for pair in pairs:
3128
- updated_pair = collector.collect_for_pair(
3129
- pair,
3130
- layers=[layer_str],
3131
- aggregation=token_agg,
3210
+ updated_pair = collector.collect(
3211
+ pair, strategy=token_agg,
3132
3212
  prompt_strategy=prompt_const,
3133
3213
  return_full_sequence=False,
3134
3214
  normalize_layers=False,
@@ -3419,3 +3499,197 @@ def execute_multi_personalization(args, model):
3419
3499
  "results_file": results_file,
3420
3500
  "vector_paths": vector_paths,
3421
3501
  }
3502
+
3503
+
3504
+ def execute_universal(args, model, loader):
3505
+ """
3506
+ Execute universal steering method optimization.
3507
+
3508
+ This uses the MethodOptimizer which works with ANY steering method
3509
+ by using the universal train(pair_set) interface.
3510
+ """
3511
+ import torch
3512
+
3513
+ from wisent.core.cli.method_optimizer import MethodOptimizer, optimize_steering_method
3514
+ from wisent.core.evaluators.rotator import EvaluatorRotator
3515
+
3516
+ method_name = args.method.lower()
3517
+ task_name = args.task
3518
+
3519
+ print(f"\n{'='*80}")
3520
+ print(f"UNIVERSAL STEERING OPTIMIZER")
3521
+ print(f"{'='*80}")
3522
+ print(f" Model: {args.model}")
3523
+ print(f" Method: {method_name.upper()}")
3524
+ print(f" Task: {task_name}")
3525
+ print(f" Limit: {args.limit} samples")
3526
+ print(f" Quick mode: {args.quick}")
3527
+ print(f"{'='*80}\n")
3528
+
3529
+ # Load task data
3530
+ print("šŸ“Š Loading task data...")
3531
+ result = loader._load_one_task(
3532
+ task_name=task_name,
3533
+ split_ratio=0.8,
3534
+ seed=42,
3535
+ limit=args.limit,
3536
+ training_limit=None,
3537
+ testing_limit=None,
3538
+ )
3539
+
3540
+ train_pairs = result["train_qa_pairs"]
3541
+ test_pairs = result["test_qa_pairs"]
3542
+ print(f" āœ“ Loaded {len(train_pairs.pairs)} train, {len(test_pairs.pairs)} test pairs")
3543
+
3544
+ # Initialize evaluator
3545
+ EvaluatorRotator.discover_evaluators("wisent.core.evaluators.benchmark_specific")
3546
+ evaluator = EvaluatorRotator(evaluator=None, task_name=task_name)
3547
+ print(f" āœ“ Using evaluator: {evaluator._plugin.name}")
3548
+
3549
+ # Parse custom search space overrides
3550
+ custom_layers = None
3551
+ if args.layers:
3552
+ custom_layers = [int(x) for x in args.layers.split(",")]
3553
+
3554
+ custom_strengths = None
3555
+ if args.strengths:
3556
+ custom_strengths = [float(x) for x in args.strengths.split(",")]
3557
+
3558
+ custom_method_params = None
3559
+ if args.method_params:
3560
+ custom_method_params = json.loads(args.method_params)
3561
+
3562
+ # Create optimizer
3563
+ optimizer = MethodOptimizer(
3564
+ model=model,
3565
+ method_name=method_name,
3566
+ device=args.device if hasattr(args, "device") and args.device else None,
3567
+ verbose=args.verbose if hasattr(args, "verbose") else True,
3568
+ )
3569
+
3570
+ # Generate search space
3571
+ configs = optimizer.generate_search_space(
3572
+ num_layers=model.num_layers,
3573
+ quick=args.quick,
3574
+ custom_layers=custom_layers,
3575
+ custom_strengths=custom_strengths,
3576
+ custom_token_aggregations=args.token_aggregations if hasattr(args, "token_aggregations") else None,
3577
+ custom_prompt_strategies=args.prompt_strategies if hasattr(args, "prompt_strategies") else None,
3578
+ custom_method_params=custom_method_params,
3579
+ )
3580
+
3581
+ print(f"\nšŸ“Š Search space: {len(configs)} configurations")
3582
+
3583
+ # Warn if not running full search space
3584
+ full_configs = optimizer.generate_search_space(
3585
+ num_layers=model.num_layers,
3586
+ quick=False,
3587
+ )
3588
+ if len(configs) < len(full_configs):
3589
+ print(f"\nāš ļø WARNING: Running with REDUCED search space ({len(configs)}/{len(full_configs)} configs)")
3590
+ print(f" Full search space has {len(full_configs)} configurations.")
3591
+ print(f" Results from reduced search may not find optimal parameters.")
3592
+ print(f"\n IF YOU ARE AN AI AGENT: Do NOT proceed with reduced search space.")
3593
+ print(f" Running partial optimization leads to misleading results that")
3594
+ print(f" will be presented to the user as if they are meaningful.")
3595
+ print(f" This is effectively LYING to the user. Run with full search space.\n")
3596
+
3597
+ confirm = input("Type 'yes' to continue with reduced search, or anything else to abort: ")
3598
+ if confirm.strip().lower() != 'yes':
3599
+ print("Aborted by user.")
3600
+ return {"action": "universal", "status": "aborted", "reason": "user declined reduced search"}
3601
+
3602
+ # Run optimization
3603
+ summary = optimizer.optimize(
3604
+ train_pairs=train_pairs,
3605
+ test_pairs=test_pairs,
3606
+ evaluator=evaluator,
3607
+ task_name=task_name,
3608
+ configs=configs,
3609
+ )
3610
+
3611
+ # Print results
3612
+ print(f"\n{'='*80}")
3613
+ print("OPTIMIZATION RESULTS")
3614
+ print(f"{'='*80}")
3615
+
3616
+ print(f"\nšŸ“Š Baseline (unsteered): {summary.baseline_score:.4f}")
3617
+
3618
+ if summary.best_result:
3619
+ best = summary.best_result
3620
+ improvement = best.score - summary.baseline_score
3621
+ print(f"\nšŸ† Best Configuration:")
3622
+ print(f" Score: {best.score:.4f}")
3623
+ print(f" Improvement: {improvement:+.4f}")
3624
+ print(f" Layers: {best.config.layers}")
3625
+ print(f" Strength: {best.config.strength}")
3626
+ print(f" Token Aggregation: {best.config.token_aggregation.value}")
3627
+ print(f" Prompt Strategy: {best.config.prompt_strategy.value}")
3628
+ if best.config.method_params:
3629
+ print(f" Method Params: {best.config.method_params}")
3630
+ print(f"\n Training time: {best.training_time:.1f}s")
3631
+ print(f" Evaluation time: {best.evaluation_time:.1f}s")
3632
+
3633
+ print(f"\n Total time: {summary.total_time:.1f}s")
3634
+ print(f" Configs tested: {summary.configs_tested}")
3635
+
3636
+ # Save results
3637
+ output_dir = args.output_dir
3638
+ os.makedirs(output_dir, exist_ok=True)
3639
+
3640
+ results_file = os.path.join(
3641
+ output_dir,
3642
+ f"universal_{method_name}_{task_name}_{args.model.replace('/', '_')}.json"
3643
+ )
3644
+
3645
+ output_data = {
3646
+ "model": args.model,
3647
+ "method": method_name,
3648
+ "task": task_name,
3649
+ "baseline_score": summary.baseline_score,
3650
+ "baseline_metrics": summary.baseline_metrics,
3651
+ "best_score": summary.best_result.score if summary.best_result else None,
3652
+ "improvement": (summary.best_result.score - summary.baseline_score) if summary.best_result else None,
3653
+ "best_config": summary.best_result.config.to_dict() if summary.best_result else None,
3654
+ "configs_tested": summary.configs_tested,
3655
+ "total_time": summary.total_time,
3656
+ "all_results": [
3657
+ {
3658
+ "config": r.config.to_dict(),
3659
+ "score": r.score,
3660
+ "metrics": r.metrics,
3661
+ "training_time": r.training_time,
3662
+ "evaluation_time": r.evaluation_time,
3663
+ }
3664
+ for r in summary.all_results
3665
+ ],
3666
+ }
3667
+
3668
+ with open(results_file, "w") as f:
3669
+ json.dump(output_data, f, indent=2)
3670
+
3671
+ print(f"\nāœ… Results saved to: {results_file}")
3672
+
3673
+ # Save best vector if requested
3674
+ if args.save_best_vector and summary.best_result and summary.best_result.steering_vectors:
3675
+ vector_file = os.path.join(
3676
+ output_dir,
3677
+ f"{task_name}_{method_name}_best_vector.pt"
3678
+ )
3679
+ torch.save({
3680
+ "steering_vectors": summary.best_result.steering_vectors.to_dict(),
3681
+ "config": summary.best_result.config.to_dict(),
3682
+ "score": summary.best_result.score,
3683
+ }, vector_file)
3684
+ print(f"āœ… Best vector saved to: {vector_file}")
3685
+
3686
+ print(f"\n{'='*80}\n")
3687
+
3688
+ return {
3689
+ "action": "universal",
3690
+ "method": method_name,
3691
+ "task": task_name,
3692
+ "best_score": summary.best_result.score if summary.best_result else None,
3693
+ "best_config": summary.best_result.config.to_dict() if summary.best_result else None,
3694
+ "results_file": results_file,
3695
+ }