wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (725)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +22 -6
  3. wisent/core/activations/activations.py +21 -39
  4. wisent/core/activations/activations_collector.py +141 -373
  5. wisent/core/activations/classifier_inference_strategy.py +194 -0
  6. wisent/core/activations/core/atoms.py +8 -92
  7. wisent/core/activations/extraction_strategy.py +308 -0
  8. wisent/core/agent/diagnose/response_diagnostics.py +3 -3
  9. wisent/core/agent/diagnose.py +3 -3
  10. wisent/core/autonomous_agent.py +2 -2
  11. wisent/core/cli/agent/apply_steering.py +23 -27
  12. wisent/core/cli/agent/evaluate_response.py +18 -20
  13. wisent/core/cli/agent/train_classifier.py +18 -20
  14. wisent/core/cli/cluster_benchmarks.py +472 -0
  15. wisent/core/cli/create_steering_vector.py +13 -5
  16. wisent/core/cli/generate_vector_from_task.py +4 -0
  17. wisent/core/cli/get_activations.py +12 -36
  18. wisent/core/cli/method_optimizer.py +859 -0
  19. wisent/core/cli/optimize.py +44 -5
  20. wisent/core/cli/optimize_classification.py +5 -6
  21. wisent/core/cli/optimize_sample_size.py +8 -22
  22. wisent/core/cli/optimize_steering.py +429 -153
  23. wisent/core/cli/optimize_weights.py +65 -6
  24. wisent/core/cli/steering_method_trainer.py +5 -4
  25. wisent/core/cli/steering_search_space.py +20 -15
  26. wisent/core/cli/tasks.py +14 -43
  27. wisent/core/cli/train_unified_goodness.py +17 -18
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1578 -173
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +63 -80
  30. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
  36. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
  37. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -5
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
  43. wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
  44. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
  45. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
  46. wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
  47. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
  48. wisent/core/evaluators/personalization/coherence.py +46 -0
  49. wisent/core/hyperparameter_optimizer.py +13 -13
  50. wisent/core/lm_eval_harness_ground_truth.py +7 -11
  51. wisent/core/main.py +3 -0
  52. wisent/core/models/wisent_model.py +8 -7
  53. wisent/core/opti/methods/opti_weights.py +29 -2
  54. wisent/core/optuna/classifier/activation_generator.py +14 -12
  55. wisent/core/optuna/steering/steering_optimization.py +14 -9
  56. wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
  57. wisent/core/parser_arguments/generate_vector_from_task_parser.py +20 -0
  58. wisent/core/parser_arguments/main_parser.py +8 -0
  59. wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
  60. wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
  61. wisent/core/parser_arguments/tasks_parser.py +7 -19
  62. wisent/core/steering_methods/core/atoms.py +1 -2
  63. wisent/core/steering_methods/methods/caa.py +1 -1
  64. wisent/core/steering_methods/methods/hyperplane.py +74 -0
  65. wisent/core/steering_methods/methods/prism.py +1 -2
  66. wisent/core/steering_methods/methods/pulse.py +39 -8
  67. wisent/core/steering_methods/methods/titan.py +59 -14
  68. wisent/core/steering_methods/registry.py +52 -12
  69. wisent/core/steering_optimizer.py +15 -15
  70. wisent/core/trainers/steering_trainer.py +9 -18
  71. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
  72. wisent/scripts/run_quality_metrics_sweep.sh +22 -27
  73. wisent/tests/test_aggregation_geometry.py +236 -0
  74. wisent/tests/test_detector_accuracy.py +163 -0
  75. wisent/tests/test_geometry_exhaustive.py +1202 -0
  76. wisent/tests/visualize_geometry.py +255 -61
  77. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/METADATA +1 -1
  78. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/RECORD +82 -714
  79. wisent/core/activations/prompt_construction_strategy.py +0 -47
  80. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
  81. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
  82. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
  83. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
  84. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
  85. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
  86. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
  87. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
  88. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
  89. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
  90. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
  96. wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
  97. wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
  98. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
  99. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
  100. wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
  101. wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
  102. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
  103. wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
  104. wisent/examples/scripts/results/benchmark_tags.json +0 -917
  105. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
  106. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
  107. wisent/examples/scripts/results/failing_benchmarks.json +0 -946
  108. wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
  109. wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
  110. wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
  111. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
  112. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
  113. wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
  114. wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
  115. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
  116. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
  117. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
  118. wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
  119. wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
  120. wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
  121. wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
  122. wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
  123. wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
  124. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
  125. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
  126. wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
  127. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
  128. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
  129. wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
  130. wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
  131. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
  132. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
  133. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
  134. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
  135. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  136. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
  137. wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
  138. wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
  139. wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
  140. wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
  141. wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
  142. wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
  143. wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
  144. wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
  145. wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
  146. wisent/examples/scripts/results/test_aime_pairs.json +0 -8
  147. wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
  148. wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
  149. wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
  150. wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
  151. wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
  152. wisent/examples/scripts/results/test_anli_pairs.json +0 -8
  153. wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
  154. wisent/examples/scripts/results/test_apps_pairs.json +0 -8
  155. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
  156. wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
  157. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
  158. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
  159. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
  160. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
  161. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
  162. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
  163. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
  164. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
  165. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
  166. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
  167. wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
  168. wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
  169. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
  170. wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
  171. wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
  172. wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
  173. wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
  174. wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
  175. wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
  176. wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
  177. wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
  178. wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
  179. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
  180. wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
  181. wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
  182. wisent/examples/scripts/results/test_atis_pairs.json +0 -8
  183. wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
  184. wisent/examples/scripts/results/test_babi_pairs.json +0 -8
  185. wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
  186. wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
  187. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
  188. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
  189. wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
  190. wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
  191. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
  192. wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
  193. wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
  194. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
  195. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
  196. wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
  197. wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
  198. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
  199. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
  200. wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
  201. wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
  202. wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
  203. wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
  204. wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
  205. wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
  206. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
  207. wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
  208. wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
  209. wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
  210. wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
  211. wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
  212. wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
  213. wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
  214. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
  215. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
  216. wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
  217. wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
  218. wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
  219. wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
  220. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
  221. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
  222. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
  223. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
  224. wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
  225. wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
  226. wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
  227. wisent/examples/scripts/results/test_c4_pairs.json +0 -8
  228. wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
  229. wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
  230. wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
  231. wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
  232. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
  233. wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
  234. wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
  235. wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
  236. wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
  237. wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
  238. wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
  239. wisent/examples/scripts/results/test_cb_pairs.json +0 -8
  240. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
  241. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
  242. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
  243. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
  244. wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
  245. wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
  246. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
  247. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
  248. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
  249. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
  250. wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
  251. wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
  252. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
  253. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
  254. wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
  255. wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
  256. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
  257. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
  258. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
  259. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
  260. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
  261. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
  262. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
  263. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
  264. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
  265. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
  266. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
  267. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
  268. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
  269. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
  270. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
  271. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
  272. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
  273. wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
  274. wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
  275. wisent/examples/scripts/results/test_cola_pairs.json +0 -8
  276. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
  277. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
  278. wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
  279. wisent/examples/scripts/results/test_conala_pairs.json +0 -8
  280. wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
  281. wisent/examples/scripts/results/test_concode_pairs.json +0 -8
  282. wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
  283. wisent/examples/scripts/results/test_copa_pairs.json +0 -8
  284. wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
  285. wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
  286. wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
  287. wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
  288. wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
  289. wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
  290. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
  291. wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
  292. wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
  293. wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
  294. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
  295. wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
  296. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
  297. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
  298. wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
  299. wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
  300. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
  301. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
  302. wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
  303. wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
  304. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
  305. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
  306. wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
  307. wisent/examples/scripts/results/test_drop_pairs.json +0 -8
  308. wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
  309. wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
  310. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
  311. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
  312. wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
  313. wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
  314. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
  315. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
  316. wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
  317. wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
  318. wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
  319. wisent/examples/scripts/results/test_escola_pairs.json +0 -8
  320. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
  321. wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
  322. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
  323. wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
  324. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
  325. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
  326. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
  327. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
  328. wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
  329. wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
  330. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
  331. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
  332. wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
  333. wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
  334. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
  335. wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
  336. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
  337. wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
  338. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  339. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  340. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
  341. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
  342. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
  343. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
  344. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
  345. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
  346. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
  347. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
  348. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  349. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  350. wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
  351. wisent/examples/scripts/results/test_fda_pairs.json +0 -8
  352. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
  353. wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
  354. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
  355. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
  356. wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
  357. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
  358. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
  359. wisent/examples/scripts/results/test_fld_pairs.json +0 -8
  360. wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
  361. wisent/examples/scripts/results/test_flores_pairs.json +0 -14
  362. wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
  363. wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
  364. wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
  365. wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
  366. wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
  367. wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
  368. wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
  369. wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
  370. wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
  371. wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
  372. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
  373. wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
  374. wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
  375. wisent/examples/scripts/results/test_glue_pairs.json +0 -14
  376. wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
  377. wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
  378. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
  379. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
  380. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
  381. wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
  382. wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
  383. wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
  384. wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
  385. wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
  386. wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
  387. wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
  388. wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
  389. wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
  390. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
  391. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
  392. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
  393. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
  394. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
  395. wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
  396. wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
  397. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
  398. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
  399. wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
  400. wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
  401. wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
  402. wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
  403. wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
  404. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
  405. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
  406. wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
  407. wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
  408. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
  409. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
  410. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
  411. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
  412. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
  413. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
  414. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
  415. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
  416. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
  417. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
  418. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
  419. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
  420. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
  421. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
  422. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
  423. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
  424. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
  425. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
  426. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
  427. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
  428. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
  429. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
  430. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
  431. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
  432. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
  433. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
  434. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
  435. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
  436. wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
  437. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
  438. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
  439. wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
  440. wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
  441. wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
  442. wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
  443. wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
  444. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
  445. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
  446. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
  447. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
  448. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
  449. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
  450. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
  451. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
  452. wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
  453. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  454. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  455. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
  456. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
  457. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
  458. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
  459. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
  460. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
  461. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
  462. wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
  463. wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
  464. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  465. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  466. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  467. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  468. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
  469. wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
  470. wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
  471. wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
  472. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
  473. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
  474. wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
  475. wisent/examples/scripts/results/test_libra_pairs.json +0 -14
  476. wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
  477. wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
  478. wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
  479. wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
  480. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
  481. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
  482. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
  483. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
  484. wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
  485. wisent/examples/scripts/results/test_llama_pairs.json +0 -8
  486. wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
  487. wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
  488. wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
  489. wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
  490. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
  491. wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
  492. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
  493. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
  494. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
  495. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
  496. wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
  497. wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
  498. wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
  499. wisent/examples/scripts/results/test_math500_pairs.json +0 -8
  500. wisent/examples/scripts/results/test_math_evaluation.json +0 -30
  501. wisent/examples/scripts/results/test_math_pairs.json +0 -8
  502. wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
  503. wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
  504. wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
  505. wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
  506. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
  507. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
  508. wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
  509. wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
  510. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
  511. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
  512. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
  513. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
  514. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
  515. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
  516. wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
  517. wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
  518. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
  519. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
  520. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
  521. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
  522. wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
  523. wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
  524. wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
  525. wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
  526. wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
  527. wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
  528. wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
  529. wisent/examples/scripts/results/test_mela_pairs.json +0 -14
  530. wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
  531. wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
  532. wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
  533. wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
  534. wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
  535. wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
  536. wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
  537. wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
  538. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
  539. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
  540. wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
  541. wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
  542. wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
  543. wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
  544. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
  545. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
  546. wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
  547. wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
  548. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
  549. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
  550. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
  551. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
  552. wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
  553. wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
  554. wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
  555. wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
  556. wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
  557. wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
  558. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
  559. wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
  560. wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
  561. wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
  562. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
  563. wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
  564. wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
  565. wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
  566. wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
  567. wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
  568. wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
  569. wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
  570. wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
  571. wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
  572. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
  573. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
  574. wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
  575. wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
  576. wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
  577. wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
  578. wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
  579. wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
  580. wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
  581. wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
  582. wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
  583. wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
  584. wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
  585. wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
  586. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
  587. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
  588. wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
  589. wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
  590. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
  591. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
  592. wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
  593. wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
  594. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
  595. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
  596. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
  597. wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
  598. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
  599. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
  600. wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
  601. wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
  602. wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
  603. wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
  604. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
  605. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
  606. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
  607. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
  608. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
  609. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
  610. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
  611. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
  612. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
  613. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
  614. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  615. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  616. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  617. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  618. wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
  619. wisent/examples/scripts/results/test_prost_pairs.json +0 -8
  620. wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
  621. wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
  622. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
  623. wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
  624. wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
  625. wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
  626. wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
  627. wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
  628. wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
  629. wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
  630. wisent/examples/scripts/results/test_race_evaluation.json +0 -30
  631. wisent/examples/scripts/results/test_race_pairs.json +0 -8
  632. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
  633. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
  634. wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
  635. wisent/examples/scripts/results/test_recode_pairs.json +0 -8
  636. wisent/examples/scripts/results/test_record_evaluation.json +0 -30
  637. wisent/examples/scripts/results/test_record_pairs.json +0 -8
  638. wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
  639. wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
  640. wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
  641. wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
  642. wisent/examples/scripts/results/test_score_evaluation.json +0 -51
  643. wisent/examples/scripts/results/test_score_pairs.json +0 -14
  644. wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
  645. wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
  646. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
  647. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
  648. wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
  649. wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
  650. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
  651. wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
  652. wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
  653. wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
  654. wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
  655. wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
  656. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
  657. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
  658. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
  659. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
  660. wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
  661. wisent/examples/scripts/results/test_swag_pairs.json +0 -8
  662. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
  663. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
  664. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
  665. wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
  666. wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
  667. wisent/examples/scripts/results/test_translation_pairs.json +0 -14
  668. wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
  669. wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
  670. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
  671. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
  672. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
  673. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
  674. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
  675. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
  676. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
  677. wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
  678. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
  679. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
  680. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
  681. wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
  682. wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
  683. wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
  684. wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
  685. wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
  686. wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
  687. wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
  688. wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
  689. wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
  690. wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
  691. wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
  692. wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
  693. wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
  694. wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
  695. wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
  696. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
  697. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
  698. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
  699. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
  700. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
  701. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
  702. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
  703. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
  704. wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
  705. wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
  706. wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
  707. wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
  708. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
  709. wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
  710. wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
  711. wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
  712. wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
  713. wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
  714. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
  715. wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
  716. wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
  717. wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
  718. wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
  719. wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
  720. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
  721. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
  722. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/WHEEL +0 -0
  723. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/entry_points.txt +0 -0
  724. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/licenses/LICENSE +0 -0
  725. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/top_level.txt +0 -0
@@ -8,8 +8,7 @@ from enum import Enum
8
8
 
9
9
  import torch
10
10
 
11
- from wisent.core.activations.core.atoms import ActivationAggregationStrategy
12
- from wisent.core.activations.prompt_construction_strategy import PromptConstructionStrategy
11
+ from wisent.core.activations.extraction_strategy import ExtractionStrategy
13
12
 
14
13
 
15
14
  class LinearityVerdict(Enum):
@@ -35,11 +34,8 @@ class LinearityConfig:
35
34
  layers_to_test: Optional[List[int]] = None
36
35
  """Specific layers to test. If None, tests sample across depth."""
37
36
 
38
- aggregation_strategies: Optional[List[ActivationAggregationStrategy]] = None
39
- """Aggregation strategies to test. If None, tests all."""
40
-
41
- prompt_strategies: Optional[List[PromptConstructionStrategy]] = None
42
- """Prompt strategies to test. If None, tests all."""
37
+ extraction_strategies: Optional[List[ExtractionStrategy]] = None
38
+ """Extraction strategies to test. If None, tests default set."""
43
39
 
44
40
  normalize_options: List[bool] = field(default_factory=lambda: [False, True])
45
41
  """Normalization options to test."""
@@ -128,24 +124,15 @@ def check_linearity(
128
124
  else:
129
125
  layers_to_test = cfg.layers_to_test
130
126
 
131
- # Determine aggregation strategies
132
- if cfg.aggregation_strategies is None:
133
- aggregation_strategies = [
134
- ActivationAggregationStrategy.LAST_TOKEN,
135
- ActivationAggregationStrategy.MEAN_POOLING,
136
- ActivationAggregationStrategy.MAX_POOLING,
137
- ]
138
- else:
139
- aggregation_strategies = cfg.aggregation_strategies
140
-
141
- # Determine prompt strategies
142
- if cfg.prompt_strategies is None:
143
- prompt_strategies = [
144
- PromptConstructionStrategy.CHAT_TEMPLATE,
145
- PromptConstructionStrategy.DIRECT_COMPLETION,
127
+ # Determine extraction strategies
128
+ if cfg.extraction_strategies is None:
129
+ extraction_strategies = [
130
+ ExtractionStrategy.CHAT_LAST,
131
+ ExtractionStrategy.CHAT_MEAN,
132
+ ExtractionStrategy.CHAT_MAX_NORM,
146
133
  ]
147
134
  else:
148
- prompt_strategies = cfg.prompt_strategies
135
+ extraction_strategies = cfg.extraction_strategies
149
136
 
150
137
  # Limit pairs
151
138
  test_pairs = pairs[:cfg.max_pairs]
@@ -157,62 +144,59 @@ def check_linearity(
157
144
 
158
145
  all_results = []
159
146
 
160
- for prompt_strategy in prompt_strategies:
161
- for agg_strategy in aggregation_strategies:
162
- for normalize in cfg.normalize_options:
163
- # Collect activations
164
- pos_activations = {l: [] for l in layers_to_test}
165
- neg_activations = {l: [] for l in layers_to_test}
166
-
167
- for pair in test_pairs:
168
- try:
169
- pair_with_acts = collector.collect_for_pair(
170
- pair,
171
- layers=[str(l) for l in layers_to_test],
172
- aggregation=agg_strategy,
173
- normalize_layers=normalize,
174
- prompt_strategy=prompt_strategy,
175
- )
176
-
177
- pos_la = pair_with_acts.positive_response.layers_activations
178
- neg_la = pair_with_acts.negative_response.layers_activations
179
-
180
- if pos_la and neg_la:
181
- for layer in layers_to_test:
182
- pos_t = pos_la.get(str(layer))
183
- neg_t = neg_la.get(str(layer))
184
- if pos_t is not None and neg_t is not None:
185
- pos_activations[layer].append(pos_t.flatten().cpu())
186
- neg_activations[layer].append(neg_t.flatten().cpu())
187
- except Exception:
188
- continue
189
-
190
- # Analyze each layer
191
- for layer in layers_to_test:
192
- pos_list = pos_activations[layer]
193
- neg_list = neg_activations[layer]
194
-
195
- if len(pos_list) < 10 or len(neg_list) < 10:
196
- continue
147
+ for strategy in extraction_strategies:
148
+ for normalize in cfg.normalize_options:
149
+ # Collect activations
150
+ pos_activations = {l: [] for l in layers_to_test}
151
+ neg_activations = {l: [] for l in layers_to_test}
152
+
153
+ for pair in test_pairs:
154
+ try:
155
+ pair_with_acts = collector.collect(
156
+ pair,
157
+ strategy=strategy,
158
+ layers=[str(l) for l in layers_to_test],
159
+ normalize=normalize,
160
+ )
197
161
 
198
- pos_tensor = torch.stack(pos_list)
199
- neg_tensor = torch.stack(neg_list)
162
+ pos_la = pair_with_acts.positive_response.layers_activations
163
+ neg_la = pair_with_acts.negative_response.layers_activations
200
164
 
201
- result = detect_geometry_structure(pos_tensor, neg_tensor, geo_config)
202
-
203
- linear_score = result.all_scores["linear"].score
204
- linear_details = result.all_scores["linear"].details
205
-
206
- all_results.append({
207
- "prompt_strategy": prompt_strategy.name,
208
- "aggregation": agg_strategy.name,
209
- "normalize": normalize,
210
- "layer": layer,
211
- "linear_score": linear_score,
212
- "cohens_d": linear_details.get("cohens_d", 0),
213
- "variance_explained": linear_details.get("variance_explained", 0),
214
- "best_structure": result.best_structure.value,
215
- })
165
+ if pos_la and neg_la:
166
+ for layer in layers_to_test:
167
+ pos_t = pos_la.get(str(layer))
168
+ neg_t = neg_la.get(str(layer))
169
+ if pos_t is not None and neg_t is not None:
170
+ pos_activations[layer].append(pos_t.flatten().cpu())
171
+ neg_activations[layer].append(neg_t.flatten().cpu())
172
+ except Exception:
173
+ continue
174
+
175
+ # Analyze each layer
176
+ for layer in layers_to_test:
177
+ pos_list = pos_activations[layer]
178
+ neg_list = neg_activations[layer]
179
+
180
+ if len(pos_list) < 10 or len(neg_list) < 10:
181
+ continue
182
+
183
+ pos_tensor = torch.stack(pos_list)
184
+ neg_tensor = torch.stack(neg_list)
185
+
186
+ result = detect_geometry_structure(pos_tensor, neg_tensor, geo_config)
187
+
188
+ linear_score = result.all_scores["linear"].score
189
+ linear_details = result.all_scores["linear"].details
190
+
191
+ all_results.append({
192
+ "extraction_strategy": strategy.value,
193
+ "normalize": normalize,
194
+ "layer": layer,
195
+ "linear_score": linear_score,
196
+ "cohens_d": linear_details.get("cohens_d", 0),
197
+ "variance_explained": linear_details.get("variance_explained", 0),
198
+ "best_structure": result.best_structure.value,
199
+ })
216
200
 
217
201
  if not all_results:
218
202
  return LinearityResult(
@@ -234,7 +218,7 @@ def check_linearity(
234
218
  verdict = LinearityVerdict.LINEAR
235
219
  recommendation = (
236
220
  f"Use CAA (single-direction steering) on layer {best['layer']} "
237
- f"with {best['prompt_strategy']} prompt and {best['aggregation']} aggregation."
221
+ f"with {best['extraction_strategy']} strategy."
238
222
  )
239
223
  elif best["linear_score"] >= cfg.weak_threshold and best["cohens_d"] >= cfg.min_cohens_d:
240
224
  verdict = LinearityVerdict.WEAKLY_LINEAR
@@ -254,8 +238,7 @@ def check_linearity(
254
238
  verdict=verdict,
255
239
  best_linear_score=best["linear_score"],
256
240
  best_config={
257
- "prompt_strategy": best["prompt_strategy"],
258
- "aggregation": best["aggregation"],
241
+ "extraction_strategy": best["extraction_strategy"],
259
242
  "normalize": best["normalize"],
260
243
  },
261
244
  best_layer=best["layer"],
@@ -281,7 +281,8 @@ def _compute_pca(
281
281
 
282
282
  n_components = min(5, n - 1)
283
283
  pca = PCA(n_components=n_components)
284
- pca.fit(difference_vectors.numpy())
284
+ # Convert to float32 for sklearn compatibility (BFloat16 not supported)
285
+ pca.fit(difference_vectors.float().numpy())
285
286
 
286
287
  pc1_var = pca.explained_variance_ratio_[0]
287
288
  pc2_var = pca.explained_variance_ratio_[1] if n_components > 1 else 0.0
@@ -372,7 +373,7 @@ def _compute_clustering(
372
373
  try:
373
374
  from sklearn.metrics import silhouette_score
374
375
 
375
- all_activations = torch.cat([positive_activations, negative_activations], dim=0).numpy()
376
+ all_activations = torch.cat([positive_activations, negative_activations], dim=0).float().numpy()
376
377
  labels = [0] * n_pos + [1] * n_neg
377
378
 
378
379
  silhouette = silhouette_score(all_activations, labels)
@@ -436,7 +437,7 @@ def _compute_cv_classification(
436
437
  from sklearn.linear_model import LogisticRegression
437
438
  from sklearn.model_selection import cross_val_score
438
439
 
439
- X = torch.cat([positive_activations, negative_activations], dim=0).numpy()
440
+ X = torch.cat([positive_activations, negative_activations], dim=0).float().numpy()
440
441
  y = np.array([1] * n_pos + [0] * n_neg)
441
442
 
442
443
  n_folds = min(config.cv_folds, min(n_pos, n_neg))
@@ -473,8 +474,8 @@ def _compute_cohens_d(
473
474
  direction = direction / direction_norm
474
475
 
475
476
  # Project all activations onto this direction
476
- pos_proj = (positive_activations @ direction).numpy()
477
- neg_proj = (negative_activations @ direction).numpy()
477
+ pos_proj = (positive_activations @ direction).float().numpy()
478
+ neg_proj = (negative_activations @ direction).float().numpy()
478
479
 
479
480
  # Cohen's d = (mean1 - mean2) / pooled_std
480
481
  mean_diff = pos_proj.mean() - neg_proj.mean()
@@ -133,14 +133,11 @@ EXTRACTORS: dict[str, str] = {
133
133
 
134
134
  # Coding benchmarks
135
135
  "humaneval": f"{base_import}humaneval:HumanEvalExtractor",
136
- "humaneval_plus": f"{base_import}humaneval:HumanEvalExtractor",
137
- "humaneval_64_instruct": f"{base_import}instructhumaneval:InstructHumanEvalExtractor",
138
- "humaneval_instruct": f"{base_import}instructhumaneval:InstructHumanEvalExtractor",
139
- "humanevalpack": f"{base_import}humaneval:HumanEvalExtractor",
140
- "instructhumaneval": f"{base_import}instructhumaneval:InstructHumanEvalExtractor",
141
- "mbpp": f"{base_import}mbpp:MBPPExtractor",
142
- "mbpp_plus": f"{base_import}mbpp:MBPPExtractor",
143
- "instruct_humaneval": f"{base_import}instructhumaneval:InstructHumanEvalExtractor",
136
+ "humaneval_64": f"{base_import}humaneval:HumanEval64Extractor",
137
+ "humaneval_plus": f"{base_import}humaneval:HumanEvalPlusExtractor",
138
+ "humaneval_instruct": f"{base_import}humaneval:HumanEvalInstructExtractor",
139
+ "humaneval_64_instruct": f"{base_import}humaneval:HumanEval64InstructExtractor",
140
+ "humanevalpack": f"{base_import}humanevalpack:HumanevalpackExtractor",
144
141
  "apps": f"{base_import}apps:AppsExtractor",
145
142
  "conala": f"{base_import}conala:ConalaExtractor",
146
143
  "concode": f"{base_import}concode:ConcodeExtractor",
@@ -156,13 +153,6 @@ EXTRACTORS: dict[str, str] = {
156
153
  "multiple_rs": f"{base_import}multipl_e:MultiplEExtractor",
157
154
  "multiple_go": f"{base_import}multipl_e:MultiplEExtractor",
158
155
  "codexglue": f"{base_import}codexglue:CodexglueExtractor",
159
- "code_x_glue": f"{base_import}codexglue:CodexglueExtractor",
160
- "codexglue_code_to_text_python": f"{base_import}codexglue:CodexglueExtractor",
161
- "codexglue_code_to_text_go": f"{base_import}codexglue:CodexglueExtractor",
162
- "codexglue_code_to_text_ruby": f"{base_import}codexglue:CodexglueExtractor",
163
- "codexglue_code_to_text_java": f"{base_import}codexglue:CodexglueExtractor",
164
- "codexglue_code_to_text_javascript": f"{base_import}codexglue:CodexglueExtractor",
165
- "codexglue_code_to_text_php": f"{base_import}codexglue:CodexglueExtractor",
166
156
  "livecodebench": f"{base_import}livecodebench:LivecodebenchExtractor",
167
157
 
168
158
  # Reasoning benchmarks
@@ -203,7 +193,6 @@ EXTRACTORS: dict[str, str] = {
203
193
  "ds1000": f"{base_import}ds1000:Ds1000Extractor",
204
194
  "evalita_mp": f"{base_import}evalita_mp:EvalitaMpExtractor",
205
195
  "flores": f"{base_import}flores:FloresExtractor",
206
- "freebase": f"{base_import}freebase:FreebaseExtractor",
207
196
  "humanevalpack": f"{base_import}humanevalpack:HumanevalpackExtractor",
208
197
  "iwslt2017_ar_en": f"{base_import}iwslt2017_ar_en:Iwslt2017ArEnExtractor",
209
198
  "iwslt2017_en_ar": f"{base_import}iwslt2017_en_ar:Iwslt2017EnArExtractor",
@@ -229,11 +218,8 @@ EXTRACTORS: dict[str, str] = {
229
218
  "flan_held_in": f"{base_import}flan_held_in:FlanHeldInExtractor",
230
219
  "gpt3_translation_benchmarks": f"{base_import}gpt3_translation_benchmarks:Gpt3TranslationBenchmarksExtractor",
231
220
  "multiple_choice": f"{base_import}multiple_choice:MultipleChoiceExtractor",
232
- "non_greedy_robustness_agieval_aqua_rat": f"{base_import}non_greedy_robustness_agieval_aqua_rat:NonGreedyRobustnessAgievalAquaRatExtractor",
233
- "option_order_robustness_agieval_aqua_rat": f"{base_import}option_order_robustness_agieval_aqua_rat:OptionOrderRobustnessAgievalAquaRatExtractor",
234
221
  "penn_treebank": f"{base_import}penn_treebank:PennTreebankExtractor",
235
222
  "ptb": f"{base_import}penn_treebank:PennTreebankExtractor",
236
- "prompt_robustness_agieval_aqua_rat": f"{base_import}prompt_robustness_agieval_aqua_rat:PromptRobustnessAgievalAquaRatExtractor",
237
223
  "self_consistency": f"{base_import}self_consistency:SelfConsistencyExtractor",
238
224
  "t0_eval": f"{base_import}t0_eval:T0EvalExtractor",
239
225
  "vaxx_stance": f"{base_import}vaxx_stance:VaxxStanceExtractor",
@@ -8,12 +8,16 @@ from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.concode
8
8
  from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.ds_1000 import Ds1000Extractor
9
9
  from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.hle import HleExtractor
10
10
  from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.hmmt import HMMTExtractor
11
- from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.humaneval import HumanEvalExtractor
12
- from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.instructhumaneval import InstructHumanEvalExtractor
11
+ from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.humaneval import (
12
+ HumanEvalExtractor,
13
+ HumanEval64Extractor,
14
+ HumanEvalPlusExtractor,
15
+ HumanEvalInstructExtractor,
16
+ HumanEval64InstructExtractor,
17
+ )
13
18
  from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.livecodebench import LivecodebenchExtractor
14
19
  from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.livemathbench import LiveMathBenchExtractor
15
20
  from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.math500 import MATH500Extractor
16
- from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.mbpp import MBPPExtractor
17
21
  from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.mercury import MercuryExtractor
18
22
  from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.multipl_e import MultiplEExtractor
19
23
  from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.polymath import PolyMathExtractor
@@ -30,11 +34,13 @@ __all__ = [
30
34
  "HleExtractor",
31
35
  "HMMTExtractor",
32
36
  "HumanEvalExtractor",
33
- "InstructHumanEvalExtractor",
37
+ "HumanEval64Extractor",
38
+ "HumanEvalPlusExtractor",
39
+ "HumanEvalInstructExtractor",
40
+ "HumanEval64InstructExtractor",
34
41
  "LivecodebenchExtractor",
35
42
  "LiveMathBenchExtractor",
36
43
  "MATH500Extractor",
37
- "MBPPExtractor",
38
44
  "MercuryExtractor",
39
45
  "MultiplEExtractor",
40
46
  "PolyMathExtractor",
@@ -1,8 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import json
4
+ import random
5
+ import re
3
6
  from typing import Any
7
+
4
8
  from wisent.core.cli_logger import setup_logger
5
- import json
6
9
 
7
10
  from wisent.core.contrastive_pairs.core.pair import ContrastivePair
8
11
  from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
@@ -88,6 +91,9 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
88
91
  log.debug(f"Could not parse solutions array: {e}")
89
92
  return None
90
93
 
94
+ # Prepend common imports (APPS solutions assume LeetCode-style environment)
95
+ correct_answer = self._prepend_imports(correct_answer)
96
+
91
97
  # Create incorrect answer (modify or corrupt)
92
98
  incorrect_answer = self._create_incorrect_answer(correct_answer)
93
99
 
@@ -96,10 +102,11 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
96
102
 
97
103
  # Parse input_output JSON to create test code
98
104
  test_code = None
105
+ entry_point = None
99
106
  if input_output:
100
107
  try:
101
108
  io_data = json.loads(input_output) if isinstance(input_output, str) else input_output
102
- test_code = self._build_test_code_from_io(io_data)
109
+ test_code, entry_point = self._build_test_code_from_io(io_data)
103
110
  except (json.JSONDecodeError, TypeError) as e:
104
111
  log.debug(f"Could not parse input_output: {e}")
105
112
 
@@ -107,6 +114,8 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
107
114
  "label": "apps",
108
115
  "source": "codeparrot/apps",
109
116
  "test_code": test_code,
117
+ "entry_point": entry_point,
118
+ "language": "python",
110
119
  }
111
120
 
112
121
  return self._build_pair(
@@ -120,29 +129,82 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
120
129
  log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
121
130
  return None
122
131
 
123
- def _build_test_code_from_io(self, io_data: dict) -> str:
132
+ @staticmethod
133
+ def _build_test_code_from_io(io_data: dict) -> tuple[str, str | None]:
124
134
  """Build test code from input/output data.
125
-
126
- APPS solutions are script-style (stdin/stdout), not functions.
127
- We use subprocess to run solution.py with the input.
135
+
136
+ APPS has two types of problems:
137
+ 1. stdin/stdout: No fn_name, run via subprocess
138
+ 2. call-based: Has fn_name, import and call Solution().fn_name()
139
+
140
+ Returns:
141
+ Tuple of (test_code, entry_point)
128
142
  """
129
143
  inputs = io_data.get("inputs", [])
130
144
  outputs = io_data.get("outputs", [])
131
-
132
- if not inputs or not outputs:
133
- return None
145
+ fn_name = io_data.get("fn_name")
134
146
 
135
- # Build test code that runs solution.py as a subprocess
136
- # Include normalize function to handle whitespace differences in APPS dataset
137
- test_code = '''import subprocess
147
+ if not inputs or not outputs:
148
+ return None, None
149
+
150
+ if fn_name:
151
+ return AppsExtractor._build_call_based_test_code(inputs, outputs, fn_name)
152
+ else:
153
+ return AppsExtractor._build_stdin_test_code(inputs, outputs)
154
+
155
+ @staticmethod
156
+ def _build_call_based_test_code(
157
+ inputs: list, outputs: list, fn_name: str
158
+ ) -> tuple[str, None]:
159
+ """Build test code for call-based (LeetCode-style) problems."""
160
+ total = len(inputs)
161
+ test_code = f'''import sys
162
+ from solution import Solution
163
+ from typing import List, Optional, Dict, Tuple, Set, Any
164
+
165
+ def compare_outputs(actual, expected):
166
+ """Compare outputs, handling floating point and nested structures."""
167
+ if isinstance(expected, float) and isinstance(actual, float):
168
+ return abs(actual - expected) < 1e-6
169
+ if isinstance(expected, list) and isinstance(actual, list):
170
+ if len(expected) != len(actual):
171
+ return False
172
+ return all(compare_outputs(a, e) for a, e in zip(actual, expected))
173
+ return actual == expected
174
+
175
+ if __name__ == '__main__':
176
+ sol = Solution()
177
+ passed = 0
178
+ total = {total}
179
+ '''
180
+ for i, (inp, out) in enumerate(zip(inputs, outputs)):
181
+ # inp is typically a list of arguments
182
+ if isinstance(inp, list):
183
+ args_repr = ", ".join(repr(arg) for arg in inp)
184
+ else:
185
+ args_repr = repr(inp)
186
+ test_code += f" # Test case {i+1}\n"
187
+ test_code += f" try:\n"
188
+ test_code += f" result = sol.{fn_name}({args_repr})\n"
189
+ test_code += f" expected = {repr(out)}\n"
190
+ test_code += f" if compare_outputs(result, expected):\n"
191
+ test_code += f" passed += 1\n"
192
+ test_code += f" except Exception:\n"
193
+ test_code += f" pass\n\n"
194
+
195
+ test_code += " print(f'PASSED:{passed}/{total}')\n"
196
+ test_code += " sys.exit(0 if passed == total else 1)\n"
197
+ return test_code, None
198
+
199
+ @staticmethod
200
+ def _build_stdin_test_code(inputs: list, outputs: list) -> tuple[str, None]:
201
+ """Build test code for stdin/stdout style problems."""
202
+ total = len(inputs)
203
+ test_code = f'''import subprocess
138
204
  import sys
139
205
 
140
206
  def normalize_output(s):
141
- """Normalize output by stripping trailing whitespace from each line.
142
-
143
- APPS dataset has inconsistent trailing whitespace in expected outputs.
144
- This normalizes both actual and expected to enable fair comparison.
145
- """
207
+ """Normalize output by stripping trailing whitespace from each line."""
146
208
  lines = s.split('\\n')
147
209
  normalized = '\\n'.join(line.rstrip() for line in lines)
148
210
  return normalized.strip()
@@ -157,26 +219,78 @@ def run_solution(input_str):
157
219
  timeout=10
158
220
  )
159
221
  if result.returncode != 0:
160
- raise RuntimeError(f"Solution failed: {result.stderr}")
222
+ raise RuntimeError(f"Solution failed: {{result.stderr}}")
161
223
  return result.stdout
162
224
 
225
+ if __name__ == '__main__':
226
+ passed = 0
227
+ total = {total}
163
228
  '''
164
- test_code += "if __name__ == '__main__':\n"
165
-
166
229
  for i, (inp, out) in enumerate(zip(inputs, outputs)):
167
230
  test_code += f" # Test case {i+1}\n"
168
- test_code += f" result = run_solution({repr(inp)})\n"
169
- test_code += f" expected = {repr(out)}\n"
170
- test_code += f" assert normalize_output(result) == normalize_output(expected), f'Test {i+1} failed: expected {{repr(expected)}}, got {{repr(result)}}'\n\n"
171
-
172
- test_code += " print('All tests passed!')\n"
173
-
174
- return test_code
231
+ test_code += f" try:\n"
232
+ test_code += f" result = run_solution({repr(inp)})\n"
233
+ test_code += f" expected = {repr(out)}\n"
234
+ test_code += f" if normalize_output(result) == normalize_output(expected):\n"
235
+ test_code += f" passed += 1\n"
236
+ test_code += f" except Exception:\n"
237
+ test_code += f" pass\n\n"
238
+
239
+ test_code += " print(f'PASSED:{passed}/{total}')\n"
240
+ test_code += " sys.exit(0 if passed == total else 1)\n"
241
+ return test_code, None
242
+
243
# Common imports for LeetCode-style solutions. Written as adjacent string
# literals so the value does not depend on source indentation.
COMMON_IMPORTS = (
    "from typing import List, Optional, Dict, Tuple, Set, Any\n"
    "import collections\n"
    "import heapq\n"
    "import itertools\n"
    "import functools\n"
    "import math\n"
    "import bisect\n"
    "from collections import defaultdict, Counter, deque\n"
)

@staticmethod
def _prepend_imports(code: str) -> str:
    """Prepend the common import prelude to solution code.

    APPS solutions assume a LeetCode-style environment where List,
    collections, heapq, etc. are pre-imported.

    The prelude is added even when the solution already has a ``typing``
    import: such a solution may still use ``heapq``/``collections``
    unimported, and re-importing a name is harmless.

    Args:
        code: Solution source code.

    Returns:
        ``code`` with ``COMMON_IMPORTS`` in front, or ``code`` unchanged
        when the prelude is already present or cannot be inserted safely.
    """
    prelude = AppsExtractor.COMMON_IMPORTS
    # Idempotent: never stack the prelude twice.
    if code.startswith(prelude):
        return code
    # ``from __future__`` imports must stay at the top of the module, so
    # putting the prelude in front of them would be a SyntaxError.
    if "from __future__ import" in code:
        return code
    return prelude + code
175
266
 
176
267
  def _create_incorrect_answer(self, correct: str) -> str:
177
- """Create an incorrect answer by modifying the correct one."""
178
- # For code, corrupt it slightly
179
- if len(correct) > 10:
180
- return correct[:len(correct)//2] + "# CORRUPTED" + correct[len(correct)//2:]
181
- return f"{correct} # INCORRECT"
268
+ """Create an incorrect answer by shuffling letters in words.
269
+
270
+ This reliably breaks code by corrupting variable/function names,
271
+ causing NameError or SyntaxError.
272
+ """
273
+ def shuffle_word(word: str) -> str:
274
+ """Shuffle all letters in a word."""
275
+ if len(word) <= 2:
276
+ return word
277
+ letters = list(word)
278
+ random.shuffle(letters)
279
+ shuffled = ''.join(letters)
280
+ if shuffled == word:
281
+ return word[::-1] # Reverse if shuffle didn't change
282
+ return shuffled
283
+
284
+ def replace_word(match: re.Match) -> str:
285
+ word = match.group(0)
286
+ return shuffle_word(word)
287
+
288
+ # Shuffle words with 3+ characters
289
+ result = re.sub(r'[A-Za-z]{3,}', replace_word, correct)
290
+
291
+ # If nothing changed (all short words), append syntax error
292
+ if result == correct:
293
+ result = correct + "\n!!SYNTAX_ERROR!!"
294
+
295
+ return result
182
296
 
@@ -20,7 +20,7 @@ class CodexglueExtractor(HuggingFaceBenchmarkExtractor):
20
20
  - code: str (code answer/solution)
21
21
  """
22
22
 
23
- evaluator_name = "generation" # Text similarity for code-to-text tasks
23
+ evaluator_name = "generation"
24
24
 
25
25
  def extract_contrastive_pairs(
26
26
  self,
@@ -82,7 +82,7 @@ class CodexglueExtractor(HuggingFaceBenchmarkExtractor):
82
82
  incorrect_answer = self._create_incorrect_answer(correct_answer)
83
83
 
84
84
  # Format the question
85
- formatted_question = f"Question: {question}\n\nWhat is the answer?"
85
+ formatted_question = f"{question}\n\nGenerate code based on description:"
86
86
 
87
87
  metadata = {
88
88
  "label": "codexglue",