wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (725) hide show
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +22 -6
  3. wisent/core/activations/activations.py +21 -39
  4. wisent/core/activations/activations_collector.py +141 -373
  5. wisent/core/activations/classifier_inference_strategy.py +194 -0
  6. wisent/core/activations/core/atoms.py +8 -92
  7. wisent/core/activations/extraction_strategy.py +308 -0
  8. wisent/core/agent/diagnose/response_diagnostics.py +3 -3
  9. wisent/core/agent/diagnose.py +3 -3
  10. wisent/core/autonomous_agent.py +2 -2
  11. wisent/core/cli/agent/apply_steering.py +23 -27
  12. wisent/core/cli/agent/evaluate_response.py +18 -20
  13. wisent/core/cli/agent/train_classifier.py +18 -20
  14. wisent/core/cli/cluster_benchmarks.py +472 -0
  15. wisent/core/cli/create_steering_vector.py +13 -5
  16. wisent/core/cli/generate_vector_from_task.py +4 -0
  17. wisent/core/cli/get_activations.py +12 -36
  18. wisent/core/cli/method_optimizer.py +859 -0
  19. wisent/core/cli/optimize.py +44 -5
  20. wisent/core/cli/optimize_classification.py +5 -6
  21. wisent/core/cli/optimize_sample_size.py +8 -22
  22. wisent/core/cli/optimize_steering.py +429 -153
  23. wisent/core/cli/optimize_weights.py +65 -6
  24. wisent/core/cli/steering_method_trainer.py +5 -4
  25. wisent/core/cli/steering_search_space.py +20 -15
  26. wisent/core/cli/tasks.py +14 -43
  27. wisent/core/cli/train_unified_goodness.py +17 -18
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1578 -173
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +63 -80
  30. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
  36. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
  37. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -5
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
  43. wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
  44. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
  45. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
  46. wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
  47. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
  48. wisent/core/evaluators/personalization/coherence.py +46 -0
  49. wisent/core/hyperparameter_optimizer.py +13 -13
  50. wisent/core/lm_eval_harness_ground_truth.py +7 -11
  51. wisent/core/main.py +3 -0
  52. wisent/core/models/wisent_model.py +8 -7
  53. wisent/core/opti/methods/opti_weights.py +29 -2
  54. wisent/core/optuna/classifier/activation_generator.py +14 -12
  55. wisent/core/optuna/steering/steering_optimization.py +14 -9
  56. wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
  57. wisent/core/parser_arguments/generate_vector_from_task_parser.py +20 -0
  58. wisent/core/parser_arguments/main_parser.py +8 -0
  59. wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
  60. wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
  61. wisent/core/parser_arguments/tasks_parser.py +7 -19
  62. wisent/core/steering_methods/core/atoms.py +1 -2
  63. wisent/core/steering_methods/methods/caa.py +1 -1
  64. wisent/core/steering_methods/methods/hyperplane.py +74 -0
  65. wisent/core/steering_methods/methods/prism.py +1 -2
  66. wisent/core/steering_methods/methods/pulse.py +39 -8
  67. wisent/core/steering_methods/methods/titan.py +59 -14
  68. wisent/core/steering_methods/registry.py +52 -12
  69. wisent/core/steering_optimizer.py +15 -15
  70. wisent/core/trainers/steering_trainer.py +9 -18
  71. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
  72. wisent/scripts/run_quality_metrics_sweep.sh +22 -27
  73. wisent/tests/test_aggregation_geometry.py +236 -0
  74. wisent/tests/test_detector_accuracy.py +163 -0
  75. wisent/tests/test_geometry_exhaustive.py +1202 -0
  76. wisent/tests/visualize_geometry.py +255 -61
  77. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/METADATA +1 -1
  78. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/RECORD +82 -714
  79. wisent/core/activations/prompt_construction_strategy.py +0 -47
  80. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
  81. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
  82. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
  83. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
  84. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
  85. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
  86. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
  87. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
  88. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
  89. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
  90. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
  96. wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
  97. wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
  98. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
  99. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
  100. wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
  101. wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
  102. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
  103. wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
  104. wisent/examples/scripts/results/benchmark_tags.json +0 -917
  105. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
  106. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
  107. wisent/examples/scripts/results/failing_benchmarks.json +0 -946
  108. wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
  109. wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
  110. wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
  111. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
  112. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
  113. wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
  114. wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
  115. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
  116. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
  117. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
  118. wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
  119. wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
  120. wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
  121. wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
  122. wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
  123. wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
  124. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
  125. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
  126. wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
  127. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
  128. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
  129. wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
  130. wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
  131. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
  132. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
  133. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
  134. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
  135. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  136. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
  137. wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
  138. wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
  139. wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
  140. wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
  141. wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
  142. wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
  143. wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
  144. wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
  145. wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
  146. wisent/examples/scripts/results/test_aime_pairs.json +0 -8
  147. wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
  148. wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
  149. wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
  150. wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
  151. wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
  152. wisent/examples/scripts/results/test_anli_pairs.json +0 -8
  153. wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
  154. wisent/examples/scripts/results/test_apps_pairs.json +0 -8
  155. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
  156. wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
  157. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
  158. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
  159. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
  160. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
  161. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
  162. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
  163. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
  164. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
  165. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
  166. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
  167. wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
  168. wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
  169. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
  170. wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
  171. wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
  172. wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
  173. wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
  174. wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
  175. wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
  176. wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
  177. wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
  178. wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
  179. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
  180. wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
  181. wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
  182. wisent/examples/scripts/results/test_atis_pairs.json +0 -8
  183. wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
  184. wisent/examples/scripts/results/test_babi_pairs.json +0 -8
  185. wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
  186. wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
  187. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
  188. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
  189. wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
  190. wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
  191. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
  192. wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
  193. wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
  194. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
  195. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
  196. wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
  197. wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
  198. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
  199. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
  200. wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
  201. wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
  202. wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
  203. wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
  204. wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
  205. wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
  206. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
  207. wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
  208. wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
  209. wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
  210. wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
  211. wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
  212. wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
  213. wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
  214. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
  215. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
  216. wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
  217. wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
  218. wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
  219. wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
  220. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
  221. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
  222. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
  223. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
  224. wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
  225. wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
  226. wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
  227. wisent/examples/scripts/results/test_c4_pairs.json +0 -8
  228. wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
  229. wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
  230. wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
  231. wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
  232. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
  233. wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
  234. wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
  235. wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
  236. wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
  237. wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
  238. wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
  239. wisent/examples/scripts/results/test_cb_pairs.json +0 -8
  240. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
  241. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
  242. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
  243. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
  244. wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
  245. wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
  246. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
  247. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
  248. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
  249. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
  250. wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
  251. wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
  252. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
  253. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
  254. wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
  255. wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
  256. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
  257. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
  258. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
  259. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
  260. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
  261. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
  262. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
  263. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
  264. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
  265. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
  266. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
  267. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
  268. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
  269. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
  270. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
  271. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
  272. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
  273. wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
  274. wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
  275. wisent/examples/scripts/results/test_cola_pairs.json +0 -8
  276. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
  277. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
  278. wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
  279. wisent/examples/scripts/results/test_conala_pairs.json +0 -8
  280. wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
  281. wisent/examples/scripts/results/test_concode_pairs.json +0 -8
  282. wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
  283. wisent/examples/scripts/results/test_copa_pairs.json +0 -8
  284. wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
  285. wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
  286. wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
  287. wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
  288. wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
  289. wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
  290. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
  291. wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
  292. wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
  293. wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
  294. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
  295. wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
  296. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
  297. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
  298. wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
  299. wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
  300. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
  301. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
  302. wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
  303. wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
  304. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
  305. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
  306. wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
  307. wisent/examples/scripts/results/test_drop_pairs.json +0 -8
  308. wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
  309. wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
  310. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
  311. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
  312. wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
  313. wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
  314. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
  315. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
  316. wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
  317. wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
  318. wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
  319. wisent/examples/scripts/results/test_escola_pairs.json +0 -8
  320. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
  321. wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
  322. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
  323. wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
  324. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
  325. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
  326. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
  327. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
  328. wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
  329. wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
  330. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
  331. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
  332. wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
  333. wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
  334. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
  335. wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
  336. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
  337. wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
  338. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  339. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  340. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
  341. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
  342. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
  343. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
  344. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
  345. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
  346. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
  347. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
  348. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  349. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  350. wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
  351. wisent/examples/scripts/results/test_fda_pairs.json +0 -8
  352. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
  353. wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
  354. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
  355. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
  356. wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
  357. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
  358. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
  359. wisent/examples/scripts/results/test_fld_pairs.json +0 -8
  360. wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
  361. wisent/examples/scripts/results/test_flores_pairs.json +0 -14
  362. wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
  363. wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
  364. wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
  365. wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
  366. wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
  367. wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
  368. wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
  369. wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
  370. wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
  371. wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
  372. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
  373. wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
  374. wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
  375. wisent/examples/scripts/results/test_glue_pairs.json +0 -14
  376. wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
  377. wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
  378. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
  379. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
  380. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
  381. wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
  382. wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
  383. wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
  384. wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
  385. wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
  386. wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
  387. wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
  388. wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
  389. wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
  390. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
  391. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
  392. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
  393. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
  394. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
  395. wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
  396. wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
  397. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
  398. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
  399. wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
  400. wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
  401. wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
  402. wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
  403. wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
  404. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
  405. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
  406. wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
  407. wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
  408. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
  409. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
  410. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
  411. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
  412. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
  413. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
  414. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
  415. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
  416. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
  417. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
  418. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
  419. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
  420. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
  421. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
  422. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
  423. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
  424. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
  425. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
  426. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
  427. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
  428. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
  429. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
  430. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
  431. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
  432. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
  433. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
  434. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
  435. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
  436. wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
  437. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
  438. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
  439. wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
  440. wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
  441. wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
  442. wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
  443. wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
  444. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
  445. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
  446. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
  447. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
  448. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
  449. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
  450. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
  451. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
  452. wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
  453. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  454. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  455. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
  456. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
  457. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
  458. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
  459. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
  460. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
  461. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
  462. wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
  463. wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
  464. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  465. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  466. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  467. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  468. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
  469. wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
  470. wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
  471. wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
  472. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
  473. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
  474. wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
  475. wisent/examples/scripts/results/test_libra_pairs.json +0 -14
  476. wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
  477. wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
  478. wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
  479. wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
  480. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
  481. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
  482. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
  483. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
  484. wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
  485. wisent/examples/scripts/results/test_llama_pairs.json +0 -8
  486. wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
  487. wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
  488. wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
  489. wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
  490. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
  491. wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
  492. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
  493. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
  494. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
  495. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
  496. wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
  497. wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
  498. wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
  499. wisent/examples/scripts/results/test_math500_pairs.json +0 -8
  500. wisent/examples/scripts/results/test_math_evaluation.json +0 -30
  501. wisent/examples/scripts/results/test_math_pairs.json +0 -8
  502. wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
  503. wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
  504. wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
  505. wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
  506. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
  507. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
  508. wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
  509. wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
  510. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
  511. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
  512. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
  513. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
  514. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
  515. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
  516. wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
  517. wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
  518. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
  519. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
  520. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
  521. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
  522. wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
  523. wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
  524. wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
  525. wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
  526. wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
  527. wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
  528. wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
  529. wisent/examples/scripts/results/test_mela_pairs.json +0 -14
  530. wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
  531. wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
  532. wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
  533. wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
  534. wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
  535. wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
  536. wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
  537. wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
  538. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
  539. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
  540. wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
  541. wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
  542. wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
  543. wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
  544. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
  545. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
  546. wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
  547. wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
  548. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
  549. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
  550. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
  551. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
  552. wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
  553. wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
  554. wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
  555. wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
  556. wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
  557. wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
  558. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
  559. wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
  560. wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
  561. wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
  562. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
  563. wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
  564. wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
  565. wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
  566. wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
  567. wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
  568. wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
  569. wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
  570. wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
  571. wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
  572. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
  573. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
  574. wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
  575. wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
  576. wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
  577. wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
  578. wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
  579. wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
  580. wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
  581. wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
  582. wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
  583. wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
  584. wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
  585. wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
  586. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
  587. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
  588. wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
  589. wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
  590. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
  591. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
  592. wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
  593. wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
  594. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
  595. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
  596. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
  597. wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
  598. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
  599. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
  600. wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
  601. wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
  602. wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
  603. wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
  604. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
  605. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
  606. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
  607. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
  608. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
  609. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
  610. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
  611. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
  612. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
  613. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
  614. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  615. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  616. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  617. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  618. wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
  619. wisent/examples/scripts/results/test_prost_pairs.json +0 -8
  620. wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
  621. wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
  622. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
  623. wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
  624. wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
  625. wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
  626. wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
  627. wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
  628. wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
  629. wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
  630. wisent/examples/scripts/results/test_race_evaluation.json +0 -30
  631. wisent/examples/scripts/results/test_race_pairs.json +0 -8
  632. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
  633. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
  634. wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
  635. wisent/examples/scripts/results/test_recode_pairs.json +0 -8
  636. wisent/examples/scripts/results/test_record_evaluation.json +0 -30
  637. wisent/examples/scripts/results/test_record_pairs.json +0 -8
  638. wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
  639. wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
  640. wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
  641. wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
  642. wisent/examples/scripts/results/test_score_evaluation.json +0 -51
  643. wisent/examples/scripts/results/test_score_pairs.json +0 -14
  644. wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
  645. wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
  646. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
  647. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
  648. wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
  649. wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
  650. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
  651. wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
  652. wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
  653. wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
  654. wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
  655. wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
  656. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
  657. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
  658. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
  659. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
  660. wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
  661. wisent/examples/scripts/results/test_swag_pairs.json +0 -8
  662. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
  663. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
  664. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
  665. wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
  666. wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
  667. wisent/examples/scripts/results/test_translation_pairs.json +0 -14
  668. wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
  669. wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
  670. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
  671. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
  672. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
  673. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
  674. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
  675. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
  676. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
  677. wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
  678. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
  679. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
  680. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
  681. wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
  682. wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
  683. wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
  684. wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
  685. wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
  686. wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
  687. wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
  688. wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
  689. wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
  690. wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
  691. wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
  692. wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
  693. wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
  694. wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
  695. wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
  696. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
  697. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
  698. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
  699. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
  700. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
  701. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
  702. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
  703. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
  704. wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
  705. wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
  706. wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
  707. wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
  708. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
  709. wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
  710. wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
  711. wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
  712. wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
  713. wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
  714. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
  715. wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
  716. wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
  717. wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
  718. wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
  719. wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
  720. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
  721. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
  722. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/WHEEL +0 -0
  723. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/entry_points.txt +0 -0
  724. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/licenses/LICENSE +0 -0
  725. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/top_level.txt +0 -0
@@ -19,6 +19,47 @@ __all__ = ["evaluate_quality"]
19
19
  # Global tokenizer cache
20
20
  _tokenizer_cache = {}
21
21
 
22
+ # Function words - the glue words of English that appear in natural text
23
+ # Real sentences need these; gibberish often lacks them
24
+ FUNCTION_WORDS = {
25
+ "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
26
+ "have", "has", "had", "do", "does", "did", "will", "would", "could",
27
+ "should", "may", "might", "must", "shall", "can", "need", "dare",
28
+ "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
29
+ "from", "as", "into", "through", "during", "before", "after", "above",
30
+ "below", "between", "under", "again", "further", "then", "once",
31
+ "and", "but", "or", "nor", "so", "yet", "both", "either", "neither",
32
+ "not", "only", "own", "same", "than", "too", "very", "just", "also",
33
+ "now", "here", "there", "when", "where", "why", "how", "all", "each",
34
+ "every", "few", "more", "most", "other", "some", "such", "no",
35
+ "any", "i", "you", "he", "she", "it", "we", "they", "me", "him", "her",
36
+ "us", "them", "my", "your", "his", "its", "our", "their", "this", "that",
37
+ "these", "those", "what", "which", "who", "whom", "whose",
38
+ }
39
+
40
+
41
+ def _has_low_function_word_ratio(text: str, threshold: float = 0.15) -> bool:
42
+ """Check if text has suspiciously low ratio of function words.
43
+
44
+ Natural English text typically has 30-50% function words.
45
+ Gibberish made of strung-together nouns/jargon has very few.
46
+
47
+ Args:
48
+ text: Text to check
49
+ threshold: Minimum ratio of function words (default 0.15)
50
+
51
+ Returns:
52
+ True if text has too few function words (likely gibberish)
53
+ """
54
+ tokens = re.findall(r'\b\w+\b', text.lower())
55
+ if len(tokens) < 6:
56
+ return False # Too short to judge
57
+
58
+ function_count = sum(1 for t in tokens if t in FUNCTION_WORDS)
59
+ ratio = function_count / len(tokens)
60
+
61
+ return ratio < threshold
62
+
22
63
 
23
64
  def _get_tokenizer():
24
65
  """Get a cached tokenizer for nonsense word detection."""
@@ -137,6 +178,11 @@ def _is_gibberish(text: str) -> bool:
137
178
  if validity_ratio < 0.3:
138
179
  return True
139
180
 
181
+ # Check 6: Function word ratio - real English has ~30-50% function words
182
+ # Gibberish made of strung-together nouns/jargon has very few
183
+ if _has_low_function_word_ratio(text, threshold=0.15):
184
+ return True
185
+
140
186
  return False
141
187
 
142
188
 
@@ -9,8 +9,8 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_sc
9
9
  from .contrastive_pairs import ContrastivePairSet
10
10
  from .steering import SteeringMethod, SteeringType
11
11
  from .activations.activations_collector import ActivationCollector
12
- from .activations.core.atoms import ActivationAggregationStrategy
13
- from .activations.prompt_construction_strategy import PromptConstructionStrategy
12
+ from .activations.extraction_strategy import ExtractionStrategy
13
+
14
14
  from wisent.core.errors import OptimizationError, NoActivationDataError, InsufficientDataError
15
15
 
16
16
  logger = logging.getLogger(__name__)
@@ -352,22 +352,22 @@ class HyperparameterOptimizer:
352
352
 
353
353
  # Map aggregation string to enum
354
354
  aggregation_map = {
355
- 'average': ActivationAggregationStrategy.MEAN_POOLING,
356
- 'first': ActivationAggregationStrategy.FIRST_TOKEN,
357
- 'last': ActivationAggregationStrategy.LAST_TOKEN,
358
- 'max': ActivationAggregationStrategy.MAX_POOLING,
355
+ 'average': ExtractionStrategy.CHAT_MEAN,
356
+ 'first': ExtractionStrategy.CHAT_FIRST,
357
+ 'last': ExtractionStrategy.CHAT_LAST,
358
+ 'max': ExtractionStrategy.CHAT_MAX_NORM,
359
359
  }
360
- agg_strategy = aggregation_map.get(aggregation, ActivationAggregationStrategy.MEAN_POOLING)
360
+ agg_strategy = aggregation_map.get(aggregation, ExtractionStrategy.CHAT_MEAN)
361
361
 
362
362
  # Map prompt strategy string to enum
363
363
  prompt_strategy_map = {
364
- 'multiple_choice': PromptConstructionStrategy.MULTIPLE_CHOICE,
365
- 'role_playing': PromptConstructionStrategy.ROLE_PLAYING,
366
- 'direct_completion': PromptConstructionStrategy.DIRECT_COMPLETION,
367
- 'instruction_following': PromptConstructionStrategy.INSTRUCTION_FOLLOWING,
368
- 'chat_template': PromptConstructionStrategy.CHAT_TEMPLATE,
364
+ 'multiple_choice': ExtractionStrategy.MC_BALANCED,
365
+ 'role_playing': ExtractionStrategy.ROLE_PLAY,
366
+ 'direct_completion': ExtractionStrategy.CHAT_LAST,
367
+ 'instruction_following': ExtractionStrategy.CHAT_LAST,
368
+ 'chat_template': ExtractionStrategy.CHAT_LAST,
369
369
  }
370
- prompt_strategy = prompt_strategy_map.get(prompt_construction_strategy, PromptConstructionStrategy.CHAT_TEMPLATE)
370
+ prompt_strategy = prompt_strategy_map.get(prompt_construction_strategy, ExtractionStrategy.CHAT_LAST)
371
371
 
372
372
  # Create activation collector
373
373
  collector = ActivationCollector(model=model, store_device="cpu")
@@ -10,7 +10,7 @@ Evaluation uses the TEST portion (20%) to ensure no data leakage with training.
10
10
  import logging
11
11
  from typing import Any, Dict
12
12
 
13
- from wisent.core.activations.core.atoms import ActivationAggregationStrategy
13
+ from wisent.core.activations.extraction_strategy import ExtractionStrategy
14
14
  from wisent.core.activations.activations import Activations
15
15
  from wisent.core.layer import Layer
16
16
  from wisent.core.utils.dataset_splits import get_all_docs_from_task, create_deterministic_split
@@ -674,16 +674,12 @@ class LMEvalHarnessGroundTruth:
674
674
  }
675
675
 
676
676
  def _map_token_aggregation_to_activation_method(self, token_aggregation: str):
677
- """Map token aggregation string to activation method."""
678
-
679
- mapping = { # TODO This should be refactor, why we use strings as Token aggregation?
680
- "average": ActivationAggregationStrategy.MEAN_POOLING,
681
- "mean": ActivationAggregationStrategy.MEAN_POOLING,
682
- "last": ActivationAggregationStrategy.LAST_TOKEN,
683
- "max": ActivationAggregationStrategy.MAX_POOLING,
684
- }
685
-
686
- return mapping.get(token_aggregation.lower(), ActivationAggregationStrategy.MEAN_POOLING)
677
+ """Map token aggregation string to ExtractionStrategy."""
678
+ # Use ExtractionStrategy directly - token_aggregation should already be a valid enum value
679
+ try:
680
+ return ExtractionStrategy(token_aggregation)
681
+ except ValueError:
682
+ return ExtractionStrategy.CHAT_LAST
687
683
 
688
684
  def _is_task_interface_task(self, task_name: str) -> bool:
689
685
  """Check if this is a TaskInterface task (not an lm-eval task)."""
wisent/core/main.py CHANGED
@@ -12,6 +12,7 @@ from wisent.core.branding import print_banner
12
12
  from wisent.core.cli import execute_tasks, execute_generate_pairs_from_task, execute_generate_pairs, execute_diagnose_pairs, execute_get_activations, execute_diagnose_vectors, execute_create_steering_vector, execute_generate_vector_from_task, execute_generate_vector_from_synthetic, execute_optimize_classification, execute_optimize_steering, execute_optimize_sample_size, execute_generate_responses, execute_evaluate_responses, execute_multi_steer, execute_agent, execute_modify_weights, execute_evaluate_refusal, execute_inference_config, execute_optimization_cache, execute_optimize_weights, execute_optimize
13
13
  from wisent.core.cli.train_unified_goodness import execute_train_unified_goodness
14
14
  from wisent.core.cli.check_linearity import execute_check_linearity
15
+ from wisent.core.cli.cluster_benchmarks import execute_cluster_benchmarks
15
16
 
16
17
 
17
18
  def _should_show_banner() -> bool:
@@ -92,6 +93,8 @@ def main():
92
93
  execute_train_unified_goodness(args)
93
94
  elif args.command == 'check-linearity':
94
95
  execute_check_linearity(args)
96
+ elif args.command == 'cluster-benchmarks':
97
+ execute_cluster_benchmarks(args)
95
98
  else:
96
99
  print(f"\n✗ Command '{args.command}' is not yet implemented")
97
100
  sys.exit(1)
@@ -154,12 +154,13 @@ class WisentModel:
154
154
  layers: list[nn.Module] = []
155
155
 
156
156
  candidates = [
157
- "layers",
158
- "model.layers",
159
- "model.decoder.layers",
160
- "transformer.h",
157
+ "layers",
158
+ "model.layers",
159
+ "model.decoder.layers",
160
+ "transformer.h",
161
161
  "base_model.model.layers",
162
- "blocks", "model.blocks",
162
+ "blocks", "model.blocks",
163
+ "gpt_neox.layers", # Pythia models
163
164
  ]
164
165
  for path in candidates:
165
166
  obj = m
@@ -516,7 +517,7 @@ class WisentModel:
516
517
  return_tensors="pt",
517
518
  padding=False, # Single prompt, no padding needed
518
519
  truncation=True, # Avoid errors on long inputs
519
- max_length=self.tokenizer.model_max_length # Use model's actual limit
520
+ max_length=self.tokenizer.model_max_length, # Use model's actual limit
520
521
  )
521
522
  # Move tensors to the correct device (same as _batch_encode does)
522
523
  batch = {
@@ -792,7 +793,7 @@ class WisentModel:
792
793
  return_tensors="pt",
793
794
  padding=False, # Single prompt, no padding needed
794
795
  truncation=True, # Avoid errors on long inputs
795
- max_length=self.tokenizer.model_max_length # Use model's actual limit
796
+ max_length=self.tokenizer.model_max_length, # Use model's actual limit
796
797
  )
797
798
  # Move tensors to the correct device (same as _batch_encode does)
798
799
  batch = {
@@ -240,11 +240,14 @@ class WeightsOptimizer(BaseOptimizer):
240
240
  norm_preserve=self.config.norm_preserve,
241
241
  verbose=False,
242
242
  )
243
- elif self.config.method == "additive":
243
+ elif self.config.method in ("additive", "titan", "prism", "pulse"):
244
244
  # Direct additive: add steering vector directly to weight matrices
245
- # This is the simplest approach that worked in manual tests
245
+ # This modifies weights directly, not biases, so it persists when saved
246
+ # Used for additive and multi-direction methods (titan/prism/pulse)
246
247
  self._apply_direct_additive(params)
247
248
  else:
249
+ # Default fallback - use bake_steering_with_kernel
250
+ # Note: This adds biases which may not load correctly for some architectures
248
251
  bake_steering_with_kernel(
249
252
  self.model,
250
253
  self.steering_vectors,
@@ -376,6 +379,8 @@ class WeightsOptimizer(BaseOptimizer):
376
379
  checkpoint_interval: int = 5,
377
380
  output_dir: str | None = None,
378
381
  tokenizer: Any = None,
382
+ s3_bucket: str | None = None,
383
+ s3_key_prefix: str | None = None,
379
384
  ) -> HPORun:
380
385
  """
381
386
  Run optimization with checkpointing support.
@@ -436,11 +441,20 @@ class WeightsOptimizer(BaseOptimizer):
436
441
  if checkpoint_path and trial_num % checkpoint_interval == 0:
437
442
  self._save_checkpoint(study, checkpoint_path)
438
443
  print(f" [Checkpoint saved at trial {trial_num}]")
444
+
445
+ # Upload checkpoint to S3
446
+ if s3_bucket and s3_key_prefix:
447
+ self._upload_to_s3(checkpoint_path, s3_bucket, f"{s3_key_prefix}/checkpoint.json")
439
448
 
440
449
  # Save best model at intervals
441
450
  if output_dir and trial_num % checkpoint_interval == 0:
442
451
  if study.best_trial is not None:
443
452
  self._save_best_model_checkpoint(study, output_dir, tokenizer)
453
+
454
+ # Upload best model checkpoint to S3
455
+ if s3_bucket and s3_key_prefix:
456
+ checkpoint_dir = os.path.join(output_dir, "checkpoint_best")
457
+ self._upload_to_s3(checkpoint_dir, s3_bucket, f"{s3_key_prefix}/checkpoint_best/")
444
458
 
445
459
  # Run optimization with callback
446
460
  study.optimize(
@@ -521,3 +535,16 @@ class WeightsOptimizer(BaseOptimizer):
521
535
  }
522
536
  with open(os.path.join(checkpoint_dir, "checkpoint_metadata.json"), "w") as f:
523
537
  json.dump(metadata, f, indent=2)
538
+
539
+ def _upload_to_s3(self, local_path: str, s3_bucket: str, s3_key: str) -> bool:
540
+ """Upload a file or directory to S3."""
541
+ import subprocess
542
+ try:
543
+ if os.path.isdir(local_path):
544
+ cmd = ["aws", "s3", "sync", local_path, f"s3://{s3_bucket}/{s3_key}", "--quiet"]
545
+ else:
546
+ cmd = ["aws", "s3", "cp", local_path, f"s3://{s3_bucket}/{s3_key}", "--quiet"]
547
+ subprocess.run(cmd, check=True, capture_output=True)
548
+ return True
549
+ except Exception:
550
+ return False
@@ -16,7 +16,7 @@ import numpy as np
16
16
  import torch
17
17
 
18
18
  from wisent.core.activations.activations_collector import ActivationCollector
19
- from wisent.core.activations.core.atoms import ActivationAggregationStrategy
19
+ from wisent.core.activations.extraction_strategy import ExtractionStrategy
20
20
  from wisent.core.activations.activations import Activations
21
21
 
22
22
  logger = logging.getLogger(__name__)
@@ -29,7 +29,7 @@ class ActivationData:
29
29
  activations: torch.Tensor
30
30
  labels: torch.Tensor
31
31
  layer: int
32
- aggregation: ActivationAggregationStrategy
32
+ aggregation: ExtractionStrategy
33
33
  metadata: dict[str, Any]
34
34
 
35
35
  def to_numpy(self) -> tuple[np.ndarray, np.ndarray]:
@@ -102,7 +102,7 @@ class GenerationConfig:
102
102
  """Configuration for activation generation."""
103
103
 
104
104
  layer_search_range: tuple[int, int]
105
- aggregation_methods: Optional[list[ActivationAggregationStrategy]] = None
105
+ aggregation_methods: Optional[list[ExtractionStrategy]] = None
106
106
  cache_dir: Optional[str] = None
107
107
  device: Optional[str] = None
108
108
  dtype: Optional[torch.dtype] = None # Auto-detect if None
@@ -113,10 +113,10 @@ class GenerationConfig:
113
113
  self.cache_dir = "./activation_cache"
114
114
  if not self.aggregation_methods:
115
115
  self.aggregation_methods = [
116
- ActivationAggregationStrategy.MEAN_POOLING,
117
- ActivationAggregationStrategy.LAST_TOKEN,
118
- ActivationAggregationStrategy.FIRST_TOKEN,
119
- ActivationAggregationStrategy.MAX_POOLING,
116
+ ExtractionStrategy.CHAT_MEAN,
117
+ ExtractionStrategy.CHAT_LAST,
118
+ ExtractionStrategy.CHAT_FIRST,
119
+ ExtractionStrategy.CHAT_MAX_NORM,
120
120
  ]
121
121
 
122
122
 
@@ -239,7 +239,7 @@ class ActivationGenerator:
239
239
  return activation_data
240
240
 
241
241
  def _apply_batch_aggregation(
242
- self, activations: torch.Tensor, strategy: ActivationAggregationStrategy
242
+ self, activations: torch.Tensor, strategy: ExtractionStrategy
243
243
  ) -> torch.Tensor:
244
244
  """
245
245
  Apply aggregation strategy to a batch of activations efficiently.
@@ -258,14 +258,16 @@ class ActivationGenerator:
258
258
  return activations
259
259
  if len(activations.shape) == 3:
260
260
  # [n_samples, n_tokens, hidden_dim] -> [n_samples, hidden_dim]
261
- if strategy == ActivationAggregationStrategy.MEAN_POOLING:
261
+ if strategy == ExtractionStrategy.CHAT_MEAN:
262
262
  return torch.mean(activations, dim=1)
263
- if strategy == ActivationAggregationStrategy.LAST_TOKEN:
263
+ if strategy == ExtractionStrategy.CHAT_LAST:
264
264
  return activations[:, -1, :]
265
- if strategy == ActivationAggregationStrategy.FIRST_TOKEN:
265
+ if strategy == ExtractionStrategy.CHAT_FIRST:
266
266
  return activations[:, 0, :]
267
- if strategy == ActivationAggregationStrategy.MAX_POOLING:
267
+ if strategy == ExtractionStrategy.CHAT_MAX_NORM:
268
268
  return torch.max(activations, dim=1)[0]
269
+ if strategy == ExtractionStrategy.CHAT_MEAN:
270
+ return torch.min(activations, dim=1)[0]
269
271
  # Default to mean pooling
270
272
  self.logger.warning(f"Unknown aggregation strategy {strategy}, using mean pooling")
271
273
  return torch.mean(activations, dim=1)
@@ -14,7 +14,7 @@ from typing import Any, Dict, List, Optional, Tuple
14
14
  import torch
15
15
  from tqdm import tqdm
16
16
 
17
- from wisent.core.activations.core import ActivationAggregationStrategy
17
+ from wisent.core.activations import ExtractionStrategy
18
18
  from wisent.core.classifier.classifier import Classifier
19
19
  from wisent.core.contrastive_pairs.contrastive_pair import ContrastivePair
20
20
  from wisent.core.contrastive_pairs.contrastive_pair_set import ContrastivePairSet
@@ -743,24 +743,29 @@ class SteeringOptimizer:
743
743
  # Apply aggregation strategy
744
744
  if (
745
745
  aggregation_strategy == "mean_pooling"
746
- or aggregation_strategy == ActivationAggregationStrategy.MEAN_POOLING.value
746
+ or aggregation_strategy == ExtractionStrategy.CHAT_MEAN.value
747
747
  ):
748
748
  aggregated = torch.mean(activation_tensor, dim=1) # [1, hidden_dim]
749
749
  elif (
750
750
  aggregation_strategy == "last_token"
751
- or aggregation_strategy == ActivationAggregationStrategy.LAST_TOKEN.value
751
+ or aggregation_strategy == ExtractionStrategy.CHAT_LAST.value
752
752
  ):
753
753
  aggregated = activation_tensor[:, -1, :] # [1, hidden_dim]
754
754
  elif (
755
755
  aggregation_strategy == "first_token"
756
- or aggregation_strategy == ActivationAggregationStrategy.FIRST_TOKEN.value
756
+ or aggregation_strategy == ExtractionStrategy.CHAT_FIRST.value
757
757
  ):
758
758
  aggregated = activation_tensor[:, 0, :] # [1, hidden_dim]
759
759
  elif (
760
760
  aggregation_strategy == "max_pooling"
761
- or aggregation_strategy == ActivationAggregationStrategy.MAX_POOLING.value
761
+ or aggregation_strategy == ExtractionStrategy.CHAT_MAX_NORM.value
762
762
  ):
763
763
  aggregated = torch.max(activation_tensor, dim=1)[0] # [1, hidden_dim]
764
+ elif (
765
+ aggregation_strategy == "min_pooling"
766
+ or aggregation_strategy == ExtractionStrategy.CHAT_MEAN.value
767
+ ):
768
+ aggregated = torch.min(activation_tensor, dim=1)[0] # [1, hidden_dim]
764
769
  else:
765
770
  # Default to mean pooling if unknown
766
771
  self.logger.warning(f"Unknown aggregation strategy {aggregation_strategy}, using mean pooling")
@@ -1029,10 +1034,10 @@ class SteeringOptimizer:
1029
1034
  generation_config = GenerationConfig(
1030
1035
  layer_search_range=(0, 23), # Will be auto-detected from model
1031
1036
  aggregation_methods=[
1032
- ActivationAggregationStrategy.MEAN_POOLING,
1033
- ActivationAggregationStrategy.LAST_TOKEN,
1034
- ActivationAggregationStrategy.FIRST_TOKEN,
1035
- ActivationAggregationStrategy.MAX_POOLING,
1037
+ ExtractionStrategy.CHAT_MEAN,
1038
+ ExtractionStrategy.CHAT_LAST,
1039
+ ExtractionStrategy.CHAT_FIRST,
1040
+ ExtractionStrategy.CHAT_MAX_NORM,
1036
1041
  ],
1037
1042
  cache_dir="./cache/steering_activations",
1038
1043
  device=optimization_config.device,
@@ -0,0 +1,31 @@
1
+ """Parser for cluster-benchmarks command."""
2
+
3
+ import argparse
4
+
5
+
6
def _positive_int(value: str) -> int:
    """argparse ``type`` callable: parse *value* as an int and require >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {number}"
        )
    return number


def setup_cluster_benchmarks_parser(parser: argparse.ArgumentParser) -> None:
    """Set up arguments for the cluster-benchmarks command.

    Registers ``--model`` (required), ``--output``, ``--pairs-per-benchmark``
    and ``--device`` on *parser*.

    Args:
        parser: The (sub)parser to add the arguments to; modified in place.
    """
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Model name or path (e.g., meta-llama/Llama-3.2-1B-Instruct)",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="./cluster_output",
        help="Output directory for results (default: ./cluster_output)",
    )
    parser.add_argument(
        "--pairs-per-benchmark",
        # Reject 0 / negative counts at parse time rather than deep inside
        # the command.
        type=_positive_int,
        default=50,
        help="Number of contrastive pairs per benchmark (default: 50)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device to use (cuda/mps/cpu). Auto-detected if not specified.",
    )
@@ -101,6 +101,26 @@ def setup_generate_vector_from_task_parser(parser: argparse.ArgumentParser) -> N
101
101
  help="Do not L2-normalize steering vectors"
102
102
  )
103
103
 
104
+ # Universal Subspace options (PRISM/TITAN)
105
+ parser.add_argument(
106
+ "--auto-num-directions",
107
+ action="store_true",
108
+ default=False,
109
+ help="Automatically determine num_directions based on explained variance (PRISM/TITAN)"
110
+ )
111
+ parser.add_argument(
112
+ "--use-universal-basis-init",
113
+ action="store_true",
114
+ default=False,
115
+ help="Initialize directions from universal basis (PRISM/TITAN)"
116
+ )
117
+ parser.add_argument(
118
+ "--num-directions",
119
+ type=int,
120
+ default=3,
121
+ help="Number of steering directions for PRISM/TITAN (default: 3)"
122
+ )
123
+
104
124
  # Intermediate file handling
105
125
  parser.add_argument(
106
126
  "--keep-intermediate",
@@ -39,6 +39,7 @@ from wisent.core.parser_arguments.optimize_weights_parser import setup_optimize_
39
39
  from wisent.core.parser_arguments.train_unified_goodness_parser import setup_train_unified_goodness_parser
40
40
  from wisent.core.parser_arguments.optimize_parser import setup_optimize_parser
41
41
  from wisent.core.parser_arguments.check_linearity_parser import setup_check_linearity_parser
42
+ from wisent.core.parser_arguments.cluster_benchmarks_parser import setup_cluster_benchmarks_parser
42
43
 
43
44
 
44
45
  def setup_parser() -> argparse.ArgumentParser:
@@ -217,4 +218,11 @@ def setup_parser() -> argparse.ArgumentParser:
217
218
  )
218
219
  setup_check_linearity_parser(check_linearity_parser)
219
220
 
221
+ # Cluster benchmarks command - cluster benchmarks by direction similarity
222
+ cluster_benchmarks_parser = subparsers.add_parser(
223
+ "cluster-benchmarks",
224
+ help="Cluster benchmarks by direction similarity with geometry analysis"
225
+ )
226
+ setup_cluster_benchmarks_parser(cluster_benchmarks_parser)
227
+
220
228
  return parser
@@ -68,6 +68,12 @@ def setup_steering_optimizer_parser(parser):
68
68
  default="./baseline_comparison",
69
69
  help="Directory to save baseline comparison results (default: ./baseline_comparison)",
70
70
  )
71
+ comprehensive_parser.add_argument(
72
+ "--output-dir",
73
+ type=str,
74
+ default="./optimization_results",
75
+ help="Directory to save optimization results (default: ./optimization_results)",
76
+ )
71
77
  comprehensive_parser.add_argument(
72
78
  "--show-comparisons",
73
79
  type=int,
@@ -170,32 +176,39 @@ def setup_steering_optimizer_parser(parser):
170
176
 
171
177
  # Base search space overrides
172
178
  comprehensive_parser.add_argument(
173
- "--search-layers",
179
+ "--search-layers", "--layers",
174
180
  type=str,
175
181
  default=None,
182
+ dest="search_layers",
176
183
  help="Comma-separated layer indices to search (e.g., '8,10,12,14')"
177
184
  )
178
185
  comprehensive_parser.add_argument(
179
- "--search-strengths",
186
+ "--search-strengths", "--strengths",
180
187
  type=str,
181
188
  default=None,
189
+ dest="search_strengths",
182
190
  help="Comma-separated strength values to search (e.g., '0.5,1.0,1.5,2.0')"
183
191
  )
184
192
  comprehensive_parser.add_argument(
185
- "--search-strategies",
193
+ "--search-strategies", "--strategies",
186
194
  type=str,
187
- nargs="+",
188
195
  default=None,
189
- choices=["constant", "initial_only", "diminishing", "all_equal"],
190
- help="Steering strategies to search"
196
+ dest="search_strategies",
197
+ help="Comma-separated steering strategies to search (e.g., 'constant,initial_only,diminishing,increasing,gaussian')"
191
198
  )
192
199
  comprehensive_parser.add_argument(
193
- "--search-token-aggregations",
200
+ "--search-token-aggregations", "--token-aggregations",
194
201
  type=str,
195
- nargs="+",
196
202
  default=None,
197
- choices=["last_token", "mean_pooling", "first_token", "max_pooling"],
198
- help="Token aggregation strategies to search"
203
+ dest="search_token_aggregations",
204
+ help="Comma-separated token aggregation strategies (e.g., 'last_token,mean_pooling,first_token,max_pooling,continuation_token,choice_token')"
205
+ )
206
+ comprehensive_parser.add_argument(
207
+ "--search-prompt-constructions", "--prompt-constructions",
208
+ type=str,
209
+ default=None,
210
+ dest="search_prompt_constructions",
211
+ help="Comma-separated prompt construction strategies (e.g., 'chat_template,direct_completion,instruction_following,multiple_choice,role_playing')"
199
212
  )
200
213
 
201
214
  # PRISM-specific search space
@@ -615,3 +628,97 @@ def setup_steering_optimizer_parser(parser):
615
628
  action="store_true",
616
629
  help="Save optimal parameters as default for this model/task combination",
617
630
  )
631
+
632
+ # ==========================================================================
633
+ # UNIVERSAL METHOD OPTIMIZER (NEW)
634
+ # ==========================================================================
635
+ # This optimizer uses the universal train(pair_set) interface that ALL
636
+ # steering methods implement, ensuring it works with any method including
637
+ # future ones.
638
+
639
+ universal_parser = steering_subparsers.add_parser(
640
+ "universal",
641
+ help="Universal optimizer that works with ANY steering method (recommended)"
642
+ )
643
+ universal_parser.add_argument("model", type=str, help="Model name or path")
644
+ universal_parser.add_argument(
645
+ "--task",
646
+ type=str,
647
+ required=True,
648
+ help="Task/benchmark to optimize for (e.g., truthfulqa_generation, arc_easy)"
649
+ )
650
+ universal_parser.add_argument(
651
+ "--method",
652
+ type=str,
653
+ default="CAA",
654
+ choices=AVAILABLE_METHODS + [m.lower() for m in AVAILABLE_METHODS],
655
+ help=f"Steering method to optimize. Available: {', '.join(AVAILABLE_METHODS)} (default: CAA)"
656
+ )
657
+ universal_parser.add_argument(
658
+ "--limit",
659
+ type=int,
660
+ default=100,
661
+ help="Maximum samples to use (default: 100)"
662
+ )
663
+ universal_parser.add_argument(
664
+ "--quick",
665
+ action="store_true",
666
+ help="Use reduced search space for faster testing"
667
+ )
668
+ universal_parser.add_argument(
669
+ "--max-configs",
670
+ type=int,
671
+ default=None,
672
+ help="Maximum number of configurations to test (default: all)"
673
+ )
674
+ universal_parser.add_argument(
675
+ "--output-dir",
676
+ type=str,
677
+ default="./optimization_results",
678
+ help="Directory to save results (default: ./optimization_results)"
679
+ )
680
+ universal_parser.add_argument(
681
+ "--save-best-vector",
682
+ action="store_true",
683
+ help="Save the best steering vector to output directory"
684
+ )
685
+ universal_parser.add_argument("--device", type=str, default=None, help="Device to run on")
686
+ universal_parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
687
+
688
+ # Search space customization
689
+ universal_parser.add_argument(
690
+ "--layers",
691
+ type=str,
692
+ default=None,
693
+ help="Comma-separated layer indices to search (e.g., '8,10,12,14')"
694
+ )
695
+ universal_parser.add_argument(
696
+ "--strengths",
697
+ type=str,
698
+ default=None,
699
+ help="Comma-separated strength values to search (e.g., '0.5,1.0,1.5,2.0')"
700
+ )
701
+ universal_parser.add_argument(
702
+ "--token-aggregations",
703
+ type=str,
704
+ nargs="+",
705
+ default=None,
706
+ choices=["last_token", "mean_pooling", "first_token", "max_pooling", "continuation_token"],
707
+ help="Token aggregation strategies to search"
708
+ )
709
+ universal_parser.add_argument(
710
+ "--prompt-strategies",
711
+ type=str,
712
+ nargs="+",
713
+ default=None,
714
+ choices=["chat_template", "direct_completion", "multiple_choice", "role_playing", "instruction_following"],
715
+ help="Prompt construction strategies to search"
716
+ )
717
+
718
+ # Method-specific parameter overrides (JSON format)
719
+ universal_parser.add_argument(
720
+ "--method-params",
721
+ type=str,
722
+ default=None,
723
+ help='JSON dict of method-specific parameter ranges, e.g., \'{"num_directions": [2, 3, 5]}\''
724
+ )
@@ -150,6 +150,12 @@ def setup_optimize_weights_parser(parser: argparse.ArgumentParser) -> None:
150
150
  default=5,
151
151
  help="Save checkpoint and best model every N trials. Default: 5"
152
152
  )
153
+ parser.add_argument(
154
+ "--s3-bucket",
155
+ type=str,
156
+ default=None,
157
+ help="S3 bucket to upload results to (e.g., 'wisent-optimization-results'). Results will be uploaded on completion."
158
+ )
153
159
 
154
160
  # ==========================================================================
155
161
  # EVALUATION CONFIGURATION