wisent 0.7.379__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (1720)
  1. wisent/__init__.py +64 -0
  2. wisent/cli.py +114 -0
  3. wisent/core/__init__.py +40 -0
  4. wisent/core/activations/__init__.py +26 -0
  5. wisent/core/activations/activations.py +97 -0
  6. wisent/core/activations/activations_collector.py +506 -0
  7. wisent/core/activations/core/__init__.py +0 -0
  8. wisent/core/activations/core/atoms.py +219 -0
  9. wisent/core/activations/prompt_construction_strategy.py +47 -0
  10. wisent/core/adapters/__init__.py +22 -0
  11. wisent/core/adapters/audio.py +616 -0
  12. wisent/core/adapters/base.py +420 -0
  13. wisent/core/adapters/multimodal.py +738 -0
  14. wisent/core/adapters/robotics.py +643 -0
  15. wisent/core/adapters/text.py +441 -0
  16. wisent/core/adapters/video.py +555 -0
  17. wisent/core/agent/__init__.py +1 -0
  18. wisent/core/agent/budget.py +644 -0
  19. wisent/core/agent/device_benchmarks.py +691 -0
  20. wisent/core/agent/diagnose/__init__.py +1 -0
  21. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  22. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  23. wisent/core/agent/diagnose/create_classifier.py +1155 -0
  24. wisent/core/agent/diagnose/response_diagnostics.py +273 -0
  25. wisent/core/agent/diagnose/select_classifiers.py +507 -0
  26. wisent/core/agent/diagnose/synthetic_classifier_option.py +755 -0
  27. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  28. wisent/core/agent/diagnose/tasks/task_manager.py +1453 -0
  29. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  30. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  31. wisent/core/agent/diagnose.py +249 -0
  32. wisent/core/agent/steer.py +215 -0
  33. wisent/core/agent/timeout.py +134 -0
  34. wisent/core/autonomous_agent.py +1158 -0
  35. wisent/core/benchmark_extractors.py +372 -0
  36. wisent/core/benchmark_registry.py +151 -0
  37. wisent/core/bigcode_extractors.py +26 -0
  38. wisent/core/bigcode_integration.py +886 -0
  39. wisent/core/branding.py +108 -0
  40. wisent/core/classifier/__init__.py +1 -0
  41. wisent/core/classifier/models/__init__.py +1 -0
  42. wisent/core/classifiers/__init__.py +1 -0
  43. wisent/core/classifiers/classifiers/__init__.py +0 -0
  44. wisent/core/classifiers/classifiers/core/__init__.py +0 -0
  45. wisent/core/classifiers/classifiers/core/atoms.py +748 -0
  46. wisent/core/classifiers/classifiers/models/__init__.py +0 -0
  47. wisent/core/classifiers/classifiers/models/logistic.py +29 -0
  48. wisent/core/classifiers/classifiers/models/mlp.py +47 -0
  49. wisent/core/classifiers/classifiers/rotator.py +137 -0
  50. wisent/core/classifiers/core/__init__.py +1 -0
  51. wisent/core/classifiers/models/__init__.py +1 -0
  52. wisent/core/classifiers/pipeline_steps/__init__.py +1 -0
  53. wisent/core/cli/__init__.py +26 -0
  54. wisent/core/cli/agent/__init__.py +15 -0
  55. wisent/core/cli/agent/apply_steering.py +192 -0
  56. wisent/core/cli/agent/evaluate_response.py +128 -0
  57. wisent/core/cli/agent/generate_synthetic_pairs.py +123 -0
  58. wisent/core/cli/agent/main.py +139 -0
  59. wisent/core/cli/agent/train_classifier.py +173 -0
  60. wisent/core/cli/check_linearity.py +126 -0
  61. wisent/core/cli/create_steering_vector.py +304 -0
  62. wisent/core/cli/diagnose_pairs.py +153 -0
  63. wisent/core/cli/diagnose_vectors.py +404 -0
  64. wisent/core/cli/estimate_unified_goodness_time.py +428 -0
  65. wisent/core/cli/evaluate_refusal.py +241 -0
  66. wisent/core/cli/evaluate_responses.py +926 -0
  67. wisent/core/cli/generate_humanization_pairs.py +128 -0
  68. wisent/core/cli/generate_pairs.py +175 -0
  69. wisent/core/cli/generate_pairs_from_task.py +108 -0
  70. wisent/core/cli/generate_responses.py +160 -0
  71. wisent/core/cli/generate_vector_from_synthetic.py +217 -0
  72. wisent/core/cli/generate_vector_from_task.py +248 -0
  73. wisent/core/cli/get_activations.py +192 -0
  74. wisent/core/cli/inference_config.py +84 -0
  75. wisent/core/cli/inference_config_cli.py +54 -0
  76. wisent/core/cli/modify_weights.py +660 -0
  77. wisent/core/cli/multi_steer.py +112 -0
  78. wisent/core/cli/optimization_cache.py +298 -0
  79. wisent/core/cli/optimize.py +621 -0
  80. wisent/core/cli/optimize_classification.py +473 -0
  81. wisent/core/cli/optimize_sample_size.py +390 -0
  82. wisent/core/cli/optimize_steering.py +3421 -0
  83. wisent/core/cli/optimize_weights.py +1287 -0
  84. wisent/core/cli/steering_method_trainer.py +641 -0
  85. wisent/core/cli/steering_search_space.py +508 -0
  86. wisent/core/cli/tasks.py +940 -0
  87. wisent/core/cli/train_unified_goodness.py +681 -0
  88. wisent/core/cli_logger.py +22 -0
  89. wisent/core/config_manager.py +1731 -0
  90. wisent/core/contrastive_pairs/__init__.py +15 -0
  91. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  92. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  93. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  94. wisent/core/contrastive_pairs/core/pair.py +183 -0
  95. wisent/core/contrastive_pairs/core/response.py +153 -0
  96. wisent/core/contrastive_pairs/core/serialization.py +306 -0
  97. wisent/core/contrastive_pairs/core/set.py +192 -0
  98. wisent/core/contrastive_pairs/diagnostics/__init__.py +79 -0
  99. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  100. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  101. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1655 -0
  102. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  103. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  104. wisent/core/contrastive_pairs/diagnostics/duplicates.py +118 -0
  105. wisent/core/contrastive_pairs/diagnostics/linearity.py +325 -0
  106. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +620 -0
  107. wisent/core/contrastive_pairs/huggingface_pairs/__init__.py +1 -0
  108. wisent/core/contrastive_pairs/huggingface_pairs/atoms.py +255 -0
  109. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +470 -0
  110. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_registry.py +136 -0
  111. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +44 -0
  112. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentbench.py +225 -0
  113. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentharm.py +267 -0
  114. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +444 -0
  115. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +225 -0
  116. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime.py +118 -0
  117. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2024.py +74 -0
  118. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2025.py +73 -0
  119. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/alpaca_eval.py +153 -0
  120. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +182 -0
  121. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/arena_hard.py +179 -0
  122. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/atis.py +89 -0
  123. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/babilong.py +96 -0
  124. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bangla_mmlu.py +108 -0
  125. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/basqueglue.py +217 -0
  126. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bec2016eu.py +99 -0
  127. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bfcl.py +283 -0
  128. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bhtc_v2.py +87 -0
  129. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +245 -0
  130. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chain_of_thought.py +89 -0
  131. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chinese_simpleqa.py +209 -0
  132. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cluewsc.py +177 -0
  133. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cnn_dailymail.py +92 -0
  134. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +378 -0
  135. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +109 -0
  136. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +15 -0
  137. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +64 -0
  138. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +65 -0
  139. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +65 -0
  140. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +65 -0
  141. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +65 -0
  142. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +65 -0
  143. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +844 -0
  144. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coedit_gec.py +79 -0
  145. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/conala.py +133 -0
  146. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/concode.py +111 -0
  147. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/dbpedia_14.py +91 -0
  148. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/doc_vqa.py +102 -0
  149. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/donotanswer.py +236 -0
  150. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds1000.py +129 -0
  151. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds_1000.py +155 -0
  152. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/epec_koref_bin.py +85 -0
  153. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ethos_binary.py +82 -0
  154. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_mp.py +165 -0
  155. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_sp_sum_task_fp_small_p1.py +89 -0
  156. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/facts_grounding.py +181 -0
  157. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +295 -0
  158. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/financial_tweets.py +100 -0
  159. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +270 -0
  160. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flan_held_in.py +98 -0
  161. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +572 -0
  162. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +143 -0
  163. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +99 -0
  164. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_negative_example_livecodebench.py +146 -0
  165. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_positive_example_livecodebench.py +140 -0
  166. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/gpt3_translation_benchmarks.py +98 -0
  167. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +389 -0
  168. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/halueval.py +246 -0
  169. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/harmbench.py +250 -0
  170. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/healthbench.py +181 -0
  171. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hle.py +106 -0
  172. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hmmt.py +117 -0
  173. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +119 -0
  174. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humanevalpack.py +102 -0
  175. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +180 -0
  176. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +129 -0
  177. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_ar_en.py +98 -0
  178. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_en_ar.py +98 -0
  179. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/jailbreakbench.py +258 -0
  180. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/law_stack_exchange.py +101 -0
  181. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ledgar.py +118 -0
  182. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench.py +61 -0
  183. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_contrastive_pair_generator.py +491 -0
  184. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_v6.py +263 -0
  185. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +230 -0
  186. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/llama.py +96 -0
  187. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +285 -0
  188. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/m_mmlu.py +96 -0
  189. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math.py +186 -0
  190. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +146 -0
  191. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +142 -0
  192. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/meddialog.py +79 -0
  193. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medical_abstracts.py +101 -0
  194. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +787 -0
  195. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +111 -0
  196. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlu_redux.py +194 -0
  197. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlusr.py +108 -0
  198. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multimedqa.py +99 -0
  199. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multipl_e.py +109 -0
  200. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple.py +96 -0
  201. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_choice.py +87 -0
  202. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_cpp.py +128 -0
  203. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_go.py +128 -0
  204. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_java.py +128 -0
  205. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_js.py +128 -0
  206. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_py.py +15 -0
  207. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_rs.py +128 -0
  208. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/non_greedy_robustness_agieval_aqua_rat.py +92 -0
  209. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +287 -0
  210. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/openllm.py +99 -0
  211. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/option_order_robustness_agieval_aqua_rat.py +92 -0
  212. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/or_bench.py +300 -0
  213. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/penn_treebank.py +80 -0
  214. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +317 -0
  215. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +467 -0
  216. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/prompt_robustness_agieval_aqua_rat.py +92 -0
  217. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/pythia.py +99 -0
  218. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +131 -0
  219. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +280 -0
  220. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/scicode.py +275 -0
  221. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/self_consistency.py +90 -0
  222. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +145 -0
  223. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sorry_bench.py +211 -0
  224. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/stsb.py +79 -0
  225. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1.py +99 -0
  226. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1_seq2seq.py +98 -0
  227. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_t5_prompt.py +123 -0
  228. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_gpqa.py +106 -0
  229. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench.py +428 -0
  230. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench_verified.py +158 -0
  231. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sycophancy_eval.py +205 -0
  232. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/t0_eval.py +79 -0
  233. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tag.py +98 -0
  234. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +305 -0
  235. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tmlu.py +109 -0
  236. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +360 -0
  237. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +386 -0
  238. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/travelplanner.py +286 -0
  239. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/truthfulqa_generation.py +128 -0
  240. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/unfair_tos.py +83 -0
  241. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/vaxx_stance.py +86 -0
  242. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wiceu.py +85 -0
  243. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wikitext103.py +97 -0
  244. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wildguard.py +280 -0
  245. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_en_fr.py +97 -0
  246. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_fr_en.py +97 -0
  247. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_de_en.py +90 -0
  248. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_de.py +90 -0
  249. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_ro.py +90 -0
  250. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_ro_en.py +90 -0
  251. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt_ro_en_t5_prompt.py +90 -0
  252. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/xsum.py +81 -0
  253. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  254. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +265 -0
  255. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/__init__.py +472 -0
  256. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aclue.py +24 -0
  257. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acp.py +33 -0
  258. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acpbench.py +39 -0
  259. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/advanced_ai_risk.py +59 -0
  260. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aexams.py +14 -0
  261. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimgsm.py +10 -0
  262. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimmlu.py +10 -0
  263. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrixnli.py +9 -0
  264. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench.py +14 -0
  265. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_adr.py +9 -0
  266. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afriqa.py +9 -0
  267. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afrisenti.py +9 -0
  268. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_belebele.py +9 -0
  269. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_flores.py +9 -0
  270. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_injongointent.py +9 -0
  271. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_mafand.py +9 -0
  272. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhaner.py +9 -0
  273. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhanews.py +9 -0
  274. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhapos.py +9 -0
  275. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_naijarc.py +9 -0
  276. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_nollysenti.py +9 -0
  277. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_ntrex.py +9 -0
  278. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_openai_mmlu.py +9 -0
  279. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_salt.py +9 -0
  280. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_sib.py +9 -0
  281. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_uhura_arc_easy.py +9 -0
  282. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_xlsum.py +9 -0
  283. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/agieval.py +33 -0
  284. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/anli.py +9 -0
  285. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arab_culture.py +24 -0
  286. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva.py +67 -0
  287. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva_light.py +67 -0
  288. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_complete.py +24 -0
  289. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_light.py +81 -0
  290. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabicmmlu.py +59 -0
  291. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aradice.py +36 -0
  292. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arc.py +61 -0
  293. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arithmetic.py +19 -0
  294. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/basque_bench.py +37 -0
  295. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbh.py +121 -0
  296. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbq.py +9 -0
  297. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/belebele.py +293 -0
  298. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bertaqa.py +25 -0
  299. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bigbench.py +300 -0
  300. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/blimp.py +76 -0
  301. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/careqa.py +9 -0
  302. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/catalan_bench.py +43 -0
  303. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ceval_valid.py +61 -0
  304. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/cmmlu.py +76 -0
  305. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +16 -0
  306. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/copal_id.py +11 -0
  307. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/crows_pairs.py +31 -0
  308. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/csatqa.py +15 -0
  309. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darija.py +29 -0
  310. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darijammlu.py +57 -0
  311. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/egymmlu.py +62 -0
  312. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/eus.py +76 -0
  313. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/evalita_mp.py +93 -0
  314. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/fld.py +9 -0
  315. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/flores.py +466 -0
  316. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +9 -0
  317. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/french_bench.py +23 -0
  318. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/galician_bench.py +41 -0
  319. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/glianorex.py +11 -0
  320. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/global_mmlu.py +115 -0
  321. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gpqa.py +27 -0
  322. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k.py +9 -0
  323. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k_platinum.py +9 -0
  324. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/haerae.py +14 -0
  325. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/headqa.py +11 -0
  326. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hellaswag.py +39 -0
  327. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_ethics.py +14 -0
  328. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_math.py +9 -0
  329. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hrm8k.py +20 -0
  330. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/inverse.py +22 -0
  331. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/japanese_leaderboard.py +20 -0
  332. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/jsonschema_bench.py +9 -0
  333. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kbl.py +85 -0
  334. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kmmlu.py +281 -0
  335. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kobest.py +14 -0
  336. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kormedmcqa.py +9 -0
  337. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lambada.py +28 -0
  338. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/leaderboard.py +52 -0
  339. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/libra.py +9 -0
  340. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lingoly.py +11 -0
  341. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/longbench.py +9 -0
  342. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/m.py +43 -0
  343. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mastermind.py +9 -0
  344. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mathqa.py +9 -0
  345. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/med.py +24 -0
  346. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/meddialog.py +12 -0
  347. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/medqa.py +9 -0
  348. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mela.py +18 -0
  349. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/metabench.py +36 -0
  350. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mgsm.py +44 -0
  351. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/minerva_math.py +16 -0
  352. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mlqa.py +58 -0
  353. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu.py +70 -0
  354. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro.py +23 -0
  355. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro_plus.py +23 -0
  356. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_prox.py +191 -0
  357. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlusr.py +9 -0
  358. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmmu.py +46 -0
  359. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/model_written_evals.py +9 -0
  360. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/multiblimp.py +111 -0
  361. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/non.py +23 -0
  362. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noreval.py +143 -0
  363. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noridiom.py +20 -0
  364. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nortruthfulqa.py +32 -0
  365. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nrk.py +20 -0
  366. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi.py +9 -0
  367. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_arc_multilingual.py +10 -0
  368. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_hellaswag_multilingual.py +24 -0
  369. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_mmlu_multilingual.py +24 -0
  370. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_truthfulqa_multilingual.py +34 -0
  371. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/paloma.py +25 -0
  372. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pawsx.py +9 -0
  373. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/persona.py +144 -0
  374. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pile.py +31 -0
  375. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/polemo2.py +9 -0
  376. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/portuguese_bench.py +31 -0
  377. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/prompt.py +23 -0
  378. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qa4mre.py +12 -0
  379. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qasper.py +11 -0
  380. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ru.py +19 -0
  381. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ruler.py +9 -0
  382. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/score.py +20 -0
  383. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/scrolls.py +9 -0
  384. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/self_consistency.py +11 -0
  385. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/spanish_bench.py +38 -0
  386. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/storycloze.py +9 -0
  387. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/super_glue_t5_prompt.py +17 -0
  388. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tinyBenchmarks.py +9 -0
  389. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmlu.py +9 -0
  390. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmmluplus.py +80 -0
  391. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/translation.py +9 -0
  392. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa.py +76 -0
  393. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa_multi.py +24 -0
  394. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/turkishmmlu.py +30 -0
  395. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unitxt.py +23 -0
  396. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unscramble.py +9 -0
  397. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/winogender.py +16 -0
  398. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmdp.py +12 -0
  399. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt14.py +16 -0
  400. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt16.py +22 -0
  401. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wsc273.py +9 -0
  402. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xcopa.py +21 -0
  403. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli.py +28 -0
  404. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli_eu.py +12 -0
  405. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xquad.py +22 -0
  406. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xstorycloze.py +22 -0
  407. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xwinograd.py +15 -0
  408. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +478 -0
  409. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +140 -0
  410. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +125 -0
  411. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +171 -0
  412. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +207 -0
  413. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +185 -0
  414. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +130 -0
  415. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +184 -0
  416. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimgsm.py +98 -0
  417. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +113 -0
  418. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +129 -0
  419. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_cot.py +88 -0
  420. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_mc.py +107 -0
  421. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ag.py +134 -0
  422. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +155 -0
  423. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ai2_arc.py +114 -0
  424. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams1.py +81 -0
  425. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams2.py +81 -0
  426. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anli.py +140 -0
  427. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +180 -0
  428. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +98 -0
  429. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +104 -0
  430. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +168 -0
  431. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +168 -0
  432. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +167 -0
  433. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +268 -0
  434. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +133 -0
  435. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +118 -0
  436. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +118 -0
  437. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_gen.py +101 -0
  438. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_mc.py +106 -0
  439. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/argument.py +134 -0
  440. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +114 -0
  441. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +122 -0
  442. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/assin.py +103 -0
  443. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +113 -0
  444. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +155 -0
  445. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_gen.py +168 -0
  446. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_mc.py +139 -0
  447. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbh.py +133 -0
  448. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +169 -0
  449. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +181 -0
  450. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +155 -0
  451. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +165 -0
  452. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +155 -0
  453. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +143 -0
  454. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bigbench.py +170 -0
  455. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +171 -0
  456. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +152 -0
  457. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +117 -0
  458. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq_seq2seq.py +117 -0
  459. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +150 -0
  460. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +152 -0
  461. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabreu.py +127 -0
  462. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +169 -0
  463. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +155 -0
  464. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_gen.py +119 -0
  465. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_mc.py +113 -0
  466. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +171 -0
  467. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +139 -0
  468. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +117 -0
  469. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +223 -0
  470. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +163 -0
  471. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +110 -0
  472. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +238 -0
  473. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +151 -0
  474. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +152 -0
  475. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +166 -0
  476. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +144 -0
  477. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +148 -0
  478. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +161 -0
  479. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +114 -0
  480. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +107 -0
  481. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +149 -0
  482. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cola.py +83 -0
  483. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +107 -0
  484. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +127 -0
  485. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +124 -0
  486. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +169 -0
  487. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +162 -0
  488. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqcat.py +114 -0
  489. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/crows_pairs.py +158 -0
  490. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +152 -0
  491. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +107 -0
  492. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle_letters.py +81 -0
  493. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +221 -0
  494. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +174 -0
  495. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +152 -0
  496. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +157 -0
  497. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +152 -0
  498. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +107 -0
  499. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  500. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egyhellaswag.py +125 -0
  501. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egymmlu.py +180 -0
  502. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +142 -0
  503. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +107 -0
  504. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +194 -0
  505. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +152 -0
  506. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +152 -0
  507. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +152 -0
  508. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/escola.py +85 -0
  509. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +135 -0
  510. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethos.py +99 -0
  511. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +107 -0
  512. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +225 -0
  513. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +159 -0
  514. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +159 -0
  515. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +159 -0
  516. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +166 -0
  517. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_sp.py +109 -0
  518. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fda.py +105 -0
  519. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +107 -0
  520. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +114 -0
  521. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fld.py +143 -0
  522. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +202 -0
  523. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_mc.py +98 -0
  524. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_perplexity.py +86 -0
  525. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galcola.py +109 -0
  526. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +155 -0
  527. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_gen.py +118 -0
  528. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_mc.py +112 -0
  529. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +141 -0
  530. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +118 -0
  531. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +171 -0
  532. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +152 -0
  533. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glue.py +109 -0
  534. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpqa.py +161 -0
  535. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +110 -0
  536. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +184 -0
  537. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm.py +108 -0
  538. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +134 -0
  539. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +152 -0
  540. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  541. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +125 -0
  542. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +225 -0
  543. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +191 -0
  544. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +179 -0
  545. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hle.py +111 -0
  546. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +203 -0
  547. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval.py +124 -0
  548. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +152 -0
  549. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +152 -0
  550. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ifeval.py +118 -0
  551. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +107 -0
  552. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +192 -0
  553. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/iwslt2017.py +117 -0
  554. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +107 -0
  555. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +155 -0
  556. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_gen.py +224 -0
  557. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +120 -0
  558. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/jsonschema_bench.py +123 -0
  559. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kbl.py +140 -0
  560. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +168 -0
  561. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_cot.py +88 -0
  562. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_mc.py +107 -0
  563. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +165 -0
  564. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +160 -0
  565. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada.py +147 -0
  566. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +185 -0
  567. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +185 -0
  568. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual_stablelm.py +141 -0
  569. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +107 -0
  570. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +194 -0
  571. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/libra.py +165 -0
  572. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +203 -0
  573. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +155 -0
  574. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +152 -0
  575. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +152 -0
  576. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logieval.py +82 -0
  577. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  578. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  579. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +152 -0
  580. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +152 -0
  581. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +203 -0
  582. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mathqa.py +137 -0
  583. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +123 -0
  584. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +115 -0
  585. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +224 -0
  586. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +180 -0
  587. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +107 -0
  588. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mediqa_qa2019.py +123 -0
  589. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +169 -0
  590. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +118 -0
  591. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medtext.py +108 -0
  592. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +96 -0
  593. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meqsum.py +115 -0
  594. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +154 -0
  595. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mgsm.py +122 -0
  596. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mimic_repsum.py +140 -0
  597. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +172 -0
  598. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mlqa.py +143 -0
  599. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +144 -0
  600. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_cot.py +88 -0
  601. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_mc.py +107 -0
  602. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_pro.py +145 -0
  603. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +189 -0
  604. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmmu.py +150 -0
  605. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mnli.py +113 -0
  606. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/model_written_evals.py +115 -0
  607. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/moral_stories.py +151 -0
  608. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  609. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog.py +118 -0
  610. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog_perplexity.py +97 -0
  611. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +134 -0
  612. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multilingual.py +106 -0
  613. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  614. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  615. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +107 -0
  616. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +173 -0
  617. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +157 -0
  618. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen.py +277 -0
  619. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +165 -0
  620. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +228 -0
  621. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +223 -0
  622. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noticia.py +105 -0
  623. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +135 -0
  624. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi.py +27 -0
  625. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +167 -0
  626. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +174 -0
  627. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +162 -0
  628. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +209 -0
  629. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +186 -0
  630. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph_perplexity.py +97 -0
  631. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +118 -0
  632. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +107 -0
  633. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paloma.py +205 -0
  634. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +110 -0
  635. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +110 -0
  636. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +107 -0
  637. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +154 -0
  638. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +115 -0
  639. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +246 -0
  640. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +144 -0
  641. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases_ca_va.py +82 -0
  642. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +161 -0
  643. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile_10k.py +140 -0
  644. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +116 -0
  645. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polemo2.py +135 -0
  646. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +155 -0
  647. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +155 -0
  648. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_gen.py +121 -0
  649. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_mc.py +103 -0
  650. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +107 -0
  651. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +115 -0
  652. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  653. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +119 -0
  654. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +118 -0
  655. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +112 -0
  656. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  657. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +107 -0
  658. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  659. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/quac.py +111 -0
  660. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +124 -0
  661. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +107 -0
  662. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/realtoxicityprompts.py +124 -0
  663. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +125 -0
  664. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +110 -0
  665. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  666. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +170 -0
  667. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +113 -0
  668. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +177 -0
  669. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +161 -0
  670. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +157 -0
  671. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +110 -0
  672. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +131 -0
  673. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +119 -0
  674. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/simple_cooccurrence_bias.py +121 -0
  675. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +209 -0
  676. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  677. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +155 -0
  678. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_gen.py +117 -0
  679. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_mc.py +110 -0
  680. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +129 -0
  681. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad_completion.py +121 -0
  682. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  683. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +250 -0
  684. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +107 -0
  685. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +107 -0
  686. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +154 -0
  687. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/superglue.py +111 -0
  688. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/supergpqa.py +111 -0
  689. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +115 -0
  690. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +179 -0
  691. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +117 -0
  692. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +110 -0
  693. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +110 -0
  694. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +110 -0
  695. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +155 -0
  696. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +110 -0
  697. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +110 -0
  698. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +110 -0
  699. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +113 -0
  700. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +110 -0
  701. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +181 -0
  702. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/toxigen.py +91 -0
  703. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/translation.py +149 -0
  704. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +130 -0
  705. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +112 -0
  706. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +120 -0
  707. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +140 -0
  708. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_multi.py +142 -0
  709. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +152 -0
  710. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +161 -0
  711. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_cot.py +104 -0
  712. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +102 -0
  713. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/twenty_newsgroups.py +111 -0
  714. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unitxt.py +131 -0
  715. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +155 -0
  716. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +95 -0
  717. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +130 -0
  718. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +122 -0
  719. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wikitext.py +146 -0
  720. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogender.py +139 -0
  721. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +118 -0
  722. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +155 -0
  723. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt14.py +110 -0
  724. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt16.py +118 -0
  725. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +114 -0
  726. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +117 -0
  727. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +180 -0
  728. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +197 -0
  729. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +147 -0
  730. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +131 -0
  731. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +203 -0
  732. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +129 -0
  733. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +124 -0
  734. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/yahoo.py +108 -0
  735. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +155 -0
  736. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +56 -0
  737. wisent/core/data_loaders/__init__.py +235 -0
  738. wisent/core/data_loaders/core/__init__.py +0 -0
  739. wisent/core/data_loaders/core/atoms.py +99 -0
  740. wisent/core/data_loaders/loaders/__init__.py +0 -0
  741. wisent/core/data_loaders/loaders/custom.py +120 -0
  742. wisent/core/data_loaders/loaders/huggingface_loader.py +153 -0
  743. wisent/core/data_loaders/loaders/lm_loader.py +494 -0
  744. wisent/core/data_loaders/loaders/lm_loader_special_cases.py +496 -0
  745. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  746. wisent/core/data_loaders/rotator.py +118 -0
  747. wisent/core/detection_handling.py +259 -0
  748. wisent/core/diversity_processors.py +193 -0
  749. wisent/core/download_full_benchmarks.py +1512 -0
  750. wisent/core/errors/__init__.py +203 -0
  751. wisent/core/errors/error_codes.py +763 -0
  752. wisent/core/errors/error_handler.py +134 -0
  753. wisent/core/evaluators/__init__.py +0 -0
  754. wisent/core/evaluators/benchmark_specific/__init__.py +42 -0
  755. wisent/core/evaluators/benchmark_specific/aime_evaluator.py +90 -0
  756. wisent/core/evaluators/benchmark_specific/coding/__init__.py +0 -0
  757. wisent/core/evaluators/benchmark_specific/coding/metrics/__init__.py +0 -0
  758. wisent/core/evaluators/benchmark_specific/coding/metrics/core/__init__.py +0 -0
  759. wisent/core/evaluators/benchmark_specific/coding/metrics/core/atoms.py +36 -0
  760. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +363 -0
  761. wisent/core/evaluators/benchmark_specific/coding/metrics/passk.py +67 -0
  762. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/__init__.py +0 -0
  763. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/__init__.py +0 -0
  764. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/atoms.py +27 -0
  765. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  766. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/java_sanitizer.py +78 -0
  767. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/python_sanitizer.py +94 -0
  768. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/utils.py +126 -0
  769. wisent/core/evaluators/benchmark_specific/coding/providers/__init__.py +18 -0
  770. wisent/core/evaluators/benchmark_specific/coding/providers/core/__init__.py +0 -0
  771. wisent/core/evaluators/benchmark_specific/coding/providers/core/atoms.py +31 -0
  772. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  773. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  774. wisent/core/evaluators/benchmark_specific/coding/safe_docker/Dockerfile +31 -0
  775. wisent/core/evaluators/benchmark_specific/coding/safe_docker/__init__.py +0 -0
  776. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/__init__.py +0 -0
  777. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/atoms.py +105 -0
  778. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/runtime.py +143 -0
  779. wisent/core/evaluators/benchmark_specific/coding/safe_docker/entrypoint.py +121 -0
  780. wisent/core/evaluators/benchmark_specific/coding/safe_docker/recipes.py +60 -0
  781. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  782. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +332 -0
  783. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +81 -0
  784. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +173 -0
  785. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +488 -0
  786. wisent/core/evaluators/benchmark_specific/livemathbench_evaluator.py +393 -0
  787. wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py +202 -0
  788. wisent/core/evaluators/benchmark_specific/math_evaluator.py +119 -0
  789. wisent/core/evaluators/benchmark_specific/math_parsing/__init__.py +1 -0
  790. wisent/core/evaluators/benchmark_specific/math_parsing/core.py +1640 -0
  791. wisent/core/evaluators/benchmark_specific/math_parsing/extract_boxed.py +48 -0
  792. wisent/core/evaluators/benchmark_specific/math_parsing/is_equiv.py +159 -0
  793. wisent/core/evaluators/benchmark_specific/math_parsing/scripts.py +919 -0
  794. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +175 -0
  795. wisent/core/evaluators/benchmark_specific/polymath_evaluator.py +114 -0
  796. wisent/core/evaluators/core/__init__.py +5 -0
  797. wisent/core/evaluators/core/atoms.py +166 -0
  798. wisent/core/evaluators/custom/__init__.py +20 -0
  799. wisent/core/evaluators/custom/custom_evaluator.py +382 -0
  800. wisent/core/evaluators/custom/examples/__init__.py +37 -0
  801. wisent/core/evaluators/custom/examples/desklib_detector.py +166 -0
  802. wisent/core/evaluators/custom/examples/gptzero.py +185 -0
  803. wisent/core/evaluators/custom/examples/humanization.py +79 -0
  804. wisent/core/evaluators/custom/examples/humanization_coherent.py +127 -0
  805. wisent/core/evaluators/custom/examples/roberta_detector.py +173 -0
  806. wisent/core/evaluators/oracles/__init__.py +0 -0
  807. wisent/core/evaluators/oracles/interactive.py +73 -0
  808. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  809. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +168 -0
  810. wisent/core/evaluators/oracles/user_specified.py +67 -0
  811. wisent/core/evaluators/personalization/__init__.py +12 -0
  812. wisent/core/evaluators/personalization/alignment.py +166 -0
  813. wisent/core/evaluators/personalization/coherence.py +325 -0
  814. wisent/core/evaluators/personalization/difference.py +73 -0
  815. wisent/core/evaluators/rotator.py +217 -0
  816. wisent/core/evaluators/steering_evaluators.py +386 -0
  817. wisent/core/evaluators/synthetic_evaluator.py +377 -0
  818. wisent/core/hyperparameter_optimizer.py +547 -0
  819. wisent/core/layer.py +17 -0
  820. wisent/core/lm_eval_harness_ground_truth.py +1431 -0
  821. wisent/core/main.py +101 -0
  822. wisent/core/managed_cached_benchmarks.py +609 -0
  823. wisent/core/mixed_benchmark_sampler.py +366 -0
  824. wisent/core/modalities/__init__.py +545 -0
  825. wisent/core/model_persistence.py +302 -0
  826. wisent/core/models/__init__.py +23 -0
  827. wisent/core/models/core/__init__.py +0 -0
  828. wisent/core/models/core/atoms.py +465 -0
  829. wisent/core/models/inference_config.py +127 -0
  830. wisent/core/models/wisent_model.py +893 -0
  831. wisent/core/multi_steering.py +397 -0
  832. wisent/core/opti/__init__.py +0 -0
  833. wisent/core/opti/core/__init__.py +0 -0
  834. wisent/core/opti/core/atoms.py +177 -0
  835. wisent/core/opti/methods/__init__.py +10 -0
  836. wisent/core/opti/methods/opti_classificator.py +172 -0
  837. wisent/core/opti/methods/opti_steering.py +139 -0
  838. wisent/core/opti/methods/opti_weights.py +523 -0
  839. wisent/core/optuna/__init__.py +54 -0
  840. wisent/core/optuna/classifier/__init__.py +25 -0
  841. wisent/core/optuna/classifier/activation_generator.py +351 -0
  842. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  843. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +685 -0
  844. wisent/core/optuna/steering/__init__.py +20 -0
  845. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +200 -0
  846. wisent/core/optuna/steering/data_utils.py +342 -0
  847. wisent/core/optuna/steering/metrics.py +412 -0
  848. wisent/core/optuna/steering/steering_optimization.py +1096 -0
  849. wisent/core/parser.py +1662 -0
  850. wisent/core/parser_arguments/__init__.py +10 -0
  851. wisent/core/parser_arguments/agent_parser.py +122 -0
  852. wisent/core/parser_arguments/check_linearity_parser.py +82 -0
  853. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  854. wisent/core/parser_arguments/create_steering_vector_parser.py +67 -0
  855. wisent/core/parser_arguments/diagnose_pairs_parser.py +25 -0
  856. wisent/core/parser_arguments/diagnose_vectors_parser.py +72 -0
  857. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  858. wisent/core/parser_arguments/evaluate_refusal_parser.py +32 -0
  859. wisent/core/parser_arguments/evaluate_responses_parser.py +12 -0
  860. wisent/core/parser_arguments/full_optimize_parser.py +194 -0
  861. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  862. wisent/core/parser_arguments/generate_pairs_parser.py +43 -0
  863. wisent/core/parser_arguments/generate_responses_parser.py +16 -0
  864. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +148 -0
  865. wisent/core/parser_arguments/generate_vector_from_task_parser.py +149 -0
  866. wisent/core/parser_arguments/generate_vector_parser.py +89 -0
  867. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  868. wisent/core/parser_arguments/inference_config_parser.py +65 -0
  869. wisent/core/parser_arguments/main_parser.py +220 -0
  870. wisent/core/parser_arguments/model_config_parser.py +59 -0
  871. wisent/core/parser_arguments/modify_weights_parser.py +309 -0
  872. wisent/core/parser_arguments/monitor_parser.py +17 -0
  873. wisent/core/parser_arguments/multi_steer_parser.py +48 -0
  874. wisent/core/parser_arguments/nonsense_parser.py +26 -0
  875. wisent/core/parser_arguments/optimization_cache_parser.py +64 -0
  876. wisent/core/parser_arguments/optimize_classification_parser.py +108 -0
  877. wisent/core/parser_arguments/optimize_parser.py +142 -0
  878. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  879. wisent/core/parser_arguments/optimize_steering_parser.py +617 -0
  880. wisent/core/parser_arguments/optimize_weights_parser.py +403 -0
  881. wisent/core/parser_arguments/synthetic_parser.py +117 -0
  882. wisent/core/parser_arguments/tasks_parser.py +591 -0
  883. wisent/core/parser_arguments/train_unified_goodness_parser.py +172 -0
  884. wisent/core/parser_arguments/utils.py +107 -0
  885. wisent/core/prompts/__init__.py +0 -0
  886. wisent/core/prompts/core/__init__.py +0 -0
  887. wisent/core/prompts/core/atom.py +57 -0
  888. wisent/core/prompts/core/prompt_formater.py +148 -0
  889. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  890. wisent/core/prompts/prompt_stratiegies/direct_completion.py +26 -0
  891. wisent/core/prompts/prompt_stratiegies/instruction_following.py +26 -0
  892. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +31 -0
  893. wisent/core/prompts/prompt_stratiegies/role_playing.py +33 -0
  894. wisent/core/representation.py +5 -0
  895. wisent/core/save_results.py +277 -0
  896. wisent/core/steering.py +660 -0
  897. wisent/core/steering_method.py +20 -0
  898. wisent/core/steering_methods/__init__.py +54 -0
  899. wisent/core/steering_methods/core/__init__.py +0 -0
  900. wisent/core/steering_methods/core/atoms.py +154 -0
  901. wisent/core/steering_methods/methods/__init__.py +0 -0
  902. wisent/core/steering_methods/methods/caa.py +45 -0
  903. wisent/core/steering_methods/methods/prism.py +588 -0
  904. wisent/core/steering_methods/methods/pulse.py +641 -0
  905. wisent/core/steering_methods/methods/titan.py +1005 -0
  906. wisent/core/steering_methods/preflight.py +322 -0
  907. wisent/core/steering_methods/registry.py +649 -0
  908. wisent/core/steering_methods/rotator.py +121 -0
  909. wisent/core/steering_optimizer.py +1503 -0
  910. wisent/core/synthetic/__init__.py +0 -0
  911. wisent/core/synthetic/cleaners/__init__.py +0 -0
  912. wisent/core/synthetic/cleaners/core/__init__.py +0 -0
  913. wisent/core/synthetic/cleaners/core/atoms.py +58 -0
  914. wisent/core/synthetic/cleaners/deduper_cleaner.py +53 -0
  915. wisent/core/synthetic/cleaners/methods/__init__.py +0 -0
  916. wisent/core/synthetic/cleaners/methods/base_dedupers.py +321 -0
  917. wisent/core/synthetic/cleaners/methods/base_refusalers.py +286 -0
  918. wisent/core/synthetic/cleaners/methods/core/__init__.py +0 -0
  919. wisent/core/synthetic/cleaners/methods/core/atoms.py +47 -0
  920. wisent/core/synthetic/cleaners/pairs_cleaner.py +90 -0
  921. wisent/core/synthetic/cleaners/refusaler_cleaner.py +133 -0
  922. wisent/core/synthetic/db_instructions/__init__.py +0 -0
  923. wisent/core/synthetic/db_instructions/core/__init__.py +0 -0
  924. wisent/core/synthetic/db_instructions/core/atoms.py +25 -0
  925. wisent/core/synthetic/db_instructions/mini_dp.py +115 -0
  926. wisent/core/synthetic/generators/__init__.py +0 -0
  927. wisent/core/synthetic/generators/core/__init__.py +0 -0
  928. wisent/core/synthetic/generators/core/atoms.py +73 -0
  929. wisent/core/synthetic/generators/diversities/__init__.py +0 -0
  930. wisent/core/synthetic/generators/diversities/core/__init__.py +0 -0
  931. wisent/core/synthetic/generators/diversities/core/core.py +68 -0
  932. wisent/core/synthetic/generators/diversities/methods/__init__.py +0 -0
  933. wisent/core/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  934. wisent/core/synthetic/generators/nonsense_generator.py +150 -0
  935. wisent/core/synthetic/generators/pairs_generator.py +313 -0
  936. wisent/core/task_interface.py +143 -0
  937. wisent/core/task_selector.py +232 -0
  938. wisent/core/tasks/__init__.py +218 -0
  939. wisent/core/tasks/aime_task.py +142 -0
  940. wisent/core/tasks/file_task.py +212 -0
  941. wisent/core/tasks/hle_task.py +180 -0
  942. wisent/core/tasks/hmmt_task.py +120 -0
  943. wisent/core/tasks/livecodebench_task.py +94 -0
  944. wisent/core/tasks/livemathbench_task.py +159 -0
  945. wisent/core/tasks/lm_eval_task.py +611 -0
  946. wisent/core/tasks/math500_task.py +84 -0
  947. wisent/core/tasks/polymath_task.py +147 -0
  948. wisent/core/tasks/supergpqa_task.py +220 -0
  949. wisent/core/time_estimator.py +155 -0
  950. wisent/core/timing_calibration.py +176 -0
  951. wisent/core/tracking/__init__.py +54 -0
  952. wisent/core/tracking/latency.py +620 -0
  953. wisent/core/tracking/memory.py +360 -0
  954. wisent/core/trainers/__init__.py +0 -0
  955. wisent/core/trainers/core/__init__.py +11 -0
  956. wisent/core/trainers/core/atoms.py +45 -0
  957. wisent/core/trainers/steering_trainer.py +365 -0
  958. wisent/core/universal_subspace.py +918 -0
  959. wisent/core/user_model_config.py +158 -0
  960. wisent/core/utils/__init__.py +64 -0
  961. wisent/core/utils/base_rotator.py +292 -0
  962. wisent/core/utils/dataset_splits.py +197 -0
  963. wisent/core/utils/device.py +279 -0
  964. wisent/core/weight_modification/__init__.py +134 -0
  965. wisent/core/weight_modification/additive.py +340 -0
  966. wisent/core/weight_modification/directional.py +1357 -0
  967. wisent/core/weight_modification/export.py +359 -0
  968. wisent/core/weight_modification/multi_direction.py +410 -0
  969. wisent/core/weight_modification/utils.py +236 -0
  970. wisent/core/wisent.py +660 -0
  971. wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +2112 -0
  972. wisent/examples/scripts/1/test_basqueglue_evaluation.json +51 -0
  973. wisent/examples/scripts/1/test_basqueglue_pairs.json +14 -0
  974. wisent/examples/scripts/1/test_bec2016eu_evaluation.json +51 -0
  975. wisent/examples/scripts/1/test_bec2016eu_pairs.json +14 -0
  976. wisent/examples/scripts/1/test_belebele_evaluation.json +51 -0
  977. wisent/examples/scripts/1/test_belebele_pairs.json +14 -0
  978. wisent/examples/scripts/1/test_benchmarks_evaluation.json +51 -0
  979. wisent/examples/scripts/1/test_benchmarks_pairs.json +14 -0
  980. wisent/examples/scripts/1/test_bertaqa_evaluation.json +51 -0
  981. wisent/examples/scripts/1/test_bertaqa_pairs.json +14 -0
  982. wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +30 -0
  983. wisent/examples/scripts/1/test_bhtc_v2_pairs.json +8 -0
  984. wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +30 -0
  985. wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +8 -0
  986. wisent/examples/scripts/1/test_cabreu_evaluation.json +30 -0
  987. wisent/examples/scripts/1/test_cabreu_pairs.json +8 -0
  988. wisent/examples/scripts/1/test_careqa_en_evaluation.json +30 -0
  989. wisent/examples/scripts/1/test_careqa_en_pairs.json +8 -0
  990. wisent/examples/scripts/1/test_careqa_evaluation.json +30 -0
  991. wisent/examples/scripts/1/test_careqa_pairs.json +8 -0
  992. wisent/examples/scripts/1/test_catalanqa_evaluation.json +30 -0
  993. wisent/examples/scripts/1/test_catalanqa_pairs.json +8 -0
  994. wisent/examples/scripts/1/test_catcola_evaluation.json +30 -0
  995. wisent/examples/scripts/1/test_catcola_pairs.json +8 -0
  996. wisent/examples/scripts/1/test_chartqa_evaluation.json +30 -0
  997. wisent/examples/scripts/1/test_chartqa_pairs.json +8 -0
  998. wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +30 -0
  999. wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +8 -0
  1000. wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +30 -0
  1001. wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +8 -0
  1002. wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +30 -0
  1003. wisent/examples/scripts/1/test_cocoteros_es_pairs.json +8 -0
  1004. wisent/examples/scripts/1/test_coedit_gec_evaluation.json +30 -0
  1005. wisent/examples/scripts/1/test_coedit_gec_pairs.json +8 -0
  1006. wisent/examples/scripts/1/test_cola_evaluation.json +30 -0
  1007. wisent/examples/scripts/1/test_cola_pairs.json +8 -0
  1008. wisent/examples/scripts/1/test_coqcat_evaluation.json +30 -0
  1009. wisent/examples/scripts/1/test_coqcat_pairs.json +8 -0
  1010. wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +30 -0
  1011. wisent/examples/scripts/1/test_dbpedia_14_pairs.json +8 -0
  1012. wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +30 -0
  1013. wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +8 -0
  1014. wisent/examples/scripts/1/test_ethos_binary_evaluation.json +30 -0
  1015. wisent/examples/scripts/1/test_ethos_binary_pairs.json +8 -0
  1016. wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +30 -0
  1017. wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +8 -0
  1018. wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +30 -0
  1019. wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +8 -0
  1020. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +30 -0
  1021. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +8 -0
  1022. wisent/examples/scripts/2/test_arc_ar_evaluation.json +30 -0
  1023. wisent/examples/scripts/2/test_arc_ar_pairs.json +8 -0
  1024. wisent/examples/scripts/2/test_atis_evaluation.json +30 -0
  1025. wisent/examples/scripts/2/test_atis_pairs.json +8 -0
  1026. wisent/examples/scripts/2/test_babi_evaluation.json +30 -0
  1027. wisent/examples/scripts/2/test_babi_pairs.json +8 -0
  1028. wisent/examples/scripts/2/test_babilong_evaluation.json +30 -0
  1029. wisent/examples/scripts/2/test_babilong_pairs.json +8 -0
  1030. wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +30 -0
  1031. wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +8 -0
  1032. wisent/examples/scripts/2/test_basque-glue_pairs.json +14 -0
  1033. wisent/examples/scripts/benchmark_tags.json +2140 -0
  1034. wisent/examples/scripts/lm_eval_readme.json +4 -0
  1035. wisent/examples/scripts/results/benchmark_descriptions.json +1244 -0
  1036. wisent/examples/scripts/results/benchmark_evaluation_methods.json +66 -0
  1037. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +2781 -0
  1038. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +30536 -0
  1039. wisent/examples/scripts/results/benchmark_evaluators_clean.json +469 -0
  1040. wisent/examples/scripts/results/benchmark_methods_summary.json +260 -0
  1041. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +66 -0
  1042. wisent/examples/scripts/results/benchmark_pair_totals.json +269 -0
  1043. wisent/examples/scripts/results/benchmark_tags.json +917 -0
  1044. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +71 -0
  1045. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +150 -0
  1046. wisent/examples/scripts/results/failing_benchmarks.json +946 -0
  1047. wisent/examples/scripts/results/failing_benchmarks_list.json +41 -0
  1048. wisent/examples/scripts/results/failing_benchmarks_test_results.json +945 -0
  1049. wisent/examples/scripts/results/missing_benchmark_tags.json +341 -0
  1050. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +30 -0
  1051. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +8 -0
  1052. wisent/examples/scripts/results/test_AraDICE_evaluation.json +51 -0
  1053. wisent/examples/scripts/results/test_AraDICE_pairs.json +14 -0
  1054. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +30 -0
  1055. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +8 -0
  1056. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +51 -0
  1057. wisent/examples/scripts/results/test_ArabCulture_pairs.json +14 -0
  1058. wisent/examples/scripts/results/test_Tag_evaluation.json +30 -0
  1059. wisent/examples/scripts/results/test_Tag_pairs.json +8 -0
  1060. wisent/examples/scripts/results/test_aclue_evaluation.json +51 -0
  1061. wisent/examples/scripts/results/test_aclue_pairs.json +14 -0
  1062. wisent/examples/scripts/results/test_acp_bench_evaluation.json +51 -0
  1063. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +51 -0
  1064. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +14 -0
  1065. wisent/examples/scripts/results/test_acp_bench_pairs.json +14 -0
  1066. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +51 -0
  1067. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +14 -0
  1068. wisent/examples/scripts/results/test_aexams_evaluation.json +51 -0
  1069. wisent/examples/scripts/results/test_aexams_pairs.json +14 -0
  1070. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +30 -0
  1071. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +8 -0
  1072. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +30 -0
  1073. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +8 -0
  1074. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +30 -0
  1075. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +8 -0
  1076. wisent/examples/scripts/results/test_ag_news_evaluation.json +30 -0
  1077. wisent/examples/scripts/results/test_ag_news_pairs.json +8 -0
  1078. wisent/examples/scripts/results/test_agieval_evaluation.json +51 -0
  1079. wisent/examples/scripts/results/test_agieval_pairs.json +14 -0
  1080. wisent/examples/scripts/results/test_aime2024_evaluation.json +30 -0
  1081. wisent/examples/scripts/results/test_aime2024_pairs.json +8 -0
  1082. wisent/examples/scripts/results/test_aime2025_evaluation.json +30 -0
  1083. wisent/examples/scripts/results/test_aime2025_pairs.json +8 -0
  1084. wisent/examples/scripts/results/test_aime_evaluation.json +30 -0
  1085. wisent/examples/scripts/results/test_aime_pairs.json +8 -0
  1086. wisent/examples/scripts/results/test_anagrams1_evaluation.json +30 -0
  1087. wisent/examples/scripts/results/test_anagrams1_pairs.json +8 -0
  1088. wisent/examples/scripts/results/test_anagrams2_evaluation.json +30 -0
  1089. wisent/examples/scripts/results/test_anagrams2_pairs.json +8 -0
  1090. wisent/examples/scripts/results/test_anli_evaluation.json +30 -0
  1091. wisent/examples/scripts/results/test_anli_pairs.json +8 -0
  1092. wisent/examples/scripts/results/test_apps_evaluation.json +30 -0
  1093. wisent/examples/scripts/results/test_apps_pairs.json +8 -0
  1094. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +30 -0
  1095. wisent/examples/scripts/results/test_arabic_exams_pairs.json +8 -0
  1096. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +51 -0
  1097. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +14 -0
  1098. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +51 -0
  1099. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +14 -0
  1100. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +51 -0
  1101. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +14 -0
  1102. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +51 -0
  1103. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +14 -0
  1104. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +51 -0
  1105. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +14 -0
  1106. wisent/examples/scripts/results/test_arc_ar_evaluation.json +30 -0
  1107. wisent/examples/scripts/results/test_arc_ar_pairs.json +8 -0
  1108. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +30 -0
  1109. wisent/examples/scripts/results/test_arc_challenge_pairs.json +8 -0
  1110. wisent/examples/scripts/results/test_arc_easy_evaluation.json +30 -0
  1111. wisent/examples/scripts/results/test_arc_easy_pairs.json +8 -0
  1112. wisent/examples/scripts/results/test_argument_topic_evaluation.json +30 -0
  1113. wisent/examples/scripts/results/test_argument_topic_pairs.json +8 -0
  1114. wisent/examples/scripts/results/test_arithmetic_evaluation.json +51 -0
  1115. wisent/examples/scripts/results/test_arithmetic_pairs.json +14 -0
  1116. wisent/examples/scripts/results/test_asdiv_evaluation.json +30 -0
  1117. wisent/examples/scripts/results/test_asdiv_pairs.json +8 -0
  1118. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +30 -0
  1119. wisent/examples/scripts/results/test_assin_entailment_pairs.json +8 -0
  1120. wisent/examples/scripts/results/test_atis_evaluation.json +30 -0
  1121. wisent/examples/scripts/results/test_atis_pairs.json +8 -0
  1122. wisent/examples/scripts/results/test_babi_evaluation.json +30 -0
  1123. wisent/examples/scripts/results/test_babi_pairs.json +8 -0
  1124. wisent/examples/scripts/results/test_babilong_evaluation.json +30 -0
  1125. wisent/examples/scripts/results/test_babilong_pairs.json +8 -0
  1126. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +30 -0
  1127. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +8 -0
  1128. wisent/examples/scripts/results/test_banking77_evaluation.json +30 -0
  1129. wisent/examples/scripts/results/test_banking77_pairs.json +8 -0
  1130. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +14 -0
  1131. wisent/examples/scripts/results/test_basque-glue_evaluation.json +51 -0
  1132. wisent/examples/scripts/results/test_basque-glue_pairs.json +14 -0
  1133. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +51 -0
  1134. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +14 -0
  1135. wisent/examples/scripts/results/test_basque_bench_evaluation.json +51 -0
  1136. wisent/examples/scripts/results/test_basque_bench_pairs.json +14 -0
  1137. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +51 -0
  1138. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +14 -0
  1139. wisent/examples/scripts/results/test_basqueglue_evaluation.json +51 -0
  1140. wisent/examples/scripts/results/test_basqueglue_pairs.json +14 -0
  1141. wisent/examples/scripts/results/test_bbh_evaluation.json +51 -0
  1142. wisent/examples/scripts/results/test_bbh_pairs.json +14 -0
  1143. wisent/examples/scripts/results/test_bbq_evaluation.json +30 -0
  1144. wisent/examples/scripts/results/test_bbq_pairs.json +8 -0
  1145. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +51 -0
  1146. wisent/examples/scripts/results/test_bec2016eu_pairs.json +14 -0
  1147. wisent/examples/scripts/results/test_belebele_evaluation.json +51 -0
  1148. wisent/examples/scripts/results/test_belebele_pairs.json +14 -0
  1149. wisent/examples/scripts/results/test_benchmarks_evaluation.json +51 -0
  1150. wisent/examples/scripts/results/test_benchmarks_pairs.json +14 -0
  1151. wisent/examples/scripts/results/test_bertaqa_evaluation.json +51 -0
  1152. wisent/examples/scripts/results/test_bertaqa_pairs.json +14 -0
  1153. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +30 -0
  1154. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +8 -0
  1155. wisent/examples/scripts/results/test_bigbench_evaluation.json +51 -0
  1156. wisent/examples/scripts/results/test_bigbench_pairs.json +14 -0
  1157. wisent/examples/scripts/results/test_blimp_evaluation.json +51 -0
  1158. wisent/examples/scripts/results/test_blimp_pairs.json +14 -0
  1159. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +30 -0
  1160. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +8 -0
  1161. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +30 -0
  1162. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +8 -0
  1163. wisent/examples/scripts/results/test_boolq_evaluation.json +30 -0
  1164. wisent/examples/scripts/results/test_boolq_pairs.json +8 -0
  1165. wisent/examples/scripts/results/test_c4_evaluation.json +30 -0
  1166. wisent/examples/scripts/results/test_c4_pairs.json +8 -0
  1167. wisent/examples/scripts/results/test_cabreu_evaluation.json +30 -0
  1168. wisent/examples/scripts/results/test_cabreu_pairs.json +8 -0
  1169. wisent/examples/scripts/results/test_careqa_evaluation.json +30 -0
  1170. wisent/examples/scripts/results/test_careqa_pairs.json +8 -0
  1171. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +51 -0
  1172. wisent/examples/scripts/results/test_catalan_bench_pairs.json +14 -0
  1173. wisent/examples/scripts/results/test_catalanqa_evaluation.json +30 -0
  1174. wisent/examples/scripts/results/test_catalanqa_pairs.json +8 -0
  1175. wisent/examples/scripts/results/test_catcola_evaluation.json +30 -0
  1176. wisent/examples/scripts/results/test_catcola_pairs.json +8 -0
  1177. wisent/examples/scripts/results/test_cb_evaluation.json +30 -0
  1178. wisent/examples/scripts/results/test_cb_pairs.json +8 -0
  1179. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +51 -0
  1180. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +14 -0
  1181. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +30 -0
  1182. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +8 -0
  1183. wisent/examples/scripts/results/test_ceval_evaluation.json +51 -0
  1184. wisent/examples/scripts/results/test_ceval_pairs.json +14 -0
  1185. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +51 -0
  1186. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +14 -0
  1187. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +51 -0
  1188. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +14 -0
  1189. wisent/examples/scripts/results/test_chartqa_evaluation.json +30 -0
  1190. wisent/examples/scripts/results/test_chartqa_pairs.json +8 -0
  1191. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +30 -0
  1192. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +8 -0
  1193. wisent/examples/scripts/results/test_cmmlu_evaluation.json +51 -0
  1194. wisent/examples/scripts/results/test_cmmlu_pairs.json +14 -0
  1195. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +30 -0
  1196. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +8 -0
  1197. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +30 -0
  1198. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +8 -0
  1199. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +30 -0
  1200. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +8 -0
  1201. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +30 -0
  1202. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +8 -0
  1203. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +30 -0
  1204. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +8 -0
  1205. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +30 -0
  1206. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +8 -0
  1207. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +30 -0
  1208. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +8 -0
  1209. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +30 -0
  1210. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +8 -0
  1211. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +30 -0
  1212. wisent/examples/scripts/results/test_coedit_gec_pairs.json +8 -0
  1213. wisent/examples/scripts/results/test_cola_evaluation.json +30 -0
  1214. wisent/examples/scripts/results/test_cola_pairs.json +8 -0
  1215. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +30 -0
  1216. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +8 -0
  1217. wisent/examples/scripts/results/test_conala_evaluation.json +30 -0
  1218. wisent/examples/scripts/results/test_conala_pairs.json +8 -0
  1219. wisent/examples/scripts/results/test_concode_evaluation.json +30 -0
  1220. wisent/examples/scripts/results/test_concode_pairs.json +8 -0
  1221. wisent/examples/scripts/results/test_copa_evaluation.json +30 -0
  1222. wisent/examples/scripts/results/test_copa_pairs.json +8 -0
  1223. wisent/examples/scripts/results/test_copal_id_evaluation.json +30 -0
  1224. wisent/examples/scripts/results/test_copal_id_pairs.json +8 -0
  1225. wisent/examples/scripts/results/test_coqa_evaluation.json +30 -0
  1226. wisent/examples/scripts/results/test_coqa_pairs.json +8 -0
  1227. wisent/examples/scripts/results/test_coqcat_evaluation.json +30 -0
  1228. wisent/examples/scripts/results/test_coqcat_pairs.json +8 -0
  1229. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +51 -0
  1230. wisent/examples/scripts/results/test_crows_pairs_pairs.json +14 -0
  1231. wisent/examples/scripts/results/test_csatqa_evaluation.json +51 -0
  1232. wisent/examples/scripts/results/test_csatqa_pairs.json +14 -0
  1233. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +30 -0
  1234. wisent/examples/scripts/results/test_cycle_letters_pairs.json +8 -0
  1235. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +51 -0
  1236. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +14 -0
  1237. wisent/examples/scripts/results/test_darija_bench_evaluation.json +51 -0
  1238. wisent/examples/scripts/results/test_darija_bench_pairs.json +14 -0
  1239. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +30 -0
  1240. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +8 -0
  1241. wisent/examples/scripts/results/test_darijammlu_evaluation.json +51 -0
  1242. wisent/examples/scripts/results/test_darijammlu_pairs.json +14 -0
  1243. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +30 -0
  1244. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +8 -0
  1245. wisent/examples/scripts/results/test_drop_evaluation.json +30 -0
  1246. wisent/examples/scripts/results/test_drop_pairs.json +8 -0
  1247. wisent/examples/scripts/results/test_ds1000_evaluation.json +30 -0
  1248. wisent/examples/scripts/results/test_ds1000_pairs.json +8 -0
  1249. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +30 -0
  1250. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +8 -0
  1251. wisent/examples/scripts/results/test_egymmlu_evaluation.json +51 -0
  1252. wisent/examples/scripts/results/test_egymmlu_pairs.json +14 -0
  1253. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +30 -0
  1254. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +8 -0
  1255. wisent/examples/scripts/results/test_eq_bench_evaluation.json +30 -0
  1256. wisent/examples/scripts/results/test_eq_bench_pairs.json +8 -0
  1257. wisent/examples/scripts/results/test_escola_evaluation.json +30 -0
  1258. wisent/examples/scripts/results/test_escola_pairs.json +8 -0
  1259. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +30 -0
  1260. wisent/examples/scripts/results/test_ethics_cm_pairs.json +8 -0
  1261. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +30 -0
  1262. wisent/examples/scripts/results/test_ethos_binary_pairs.json +8 -0
  1263. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +51 -0
  1264. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +14 -0
  1265. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +51 -0
  1266. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +14 -0
  1267. wisent/examples/scripts/results/test_eus_exams_evaluation.json +51 -0
  1268. wisent/examples/scripts/results/test_eus_exams_pairs.json +14 -0
  1269. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +30 -0
  1270. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +8 -0
  1271. wisent/examples/scripts/results/test_eus_reading_evaluation.json +30 -0
  1272. wisent/examples/scripts/results/test_eus_reading_pairs.json +8 -0
  1273. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +30 -0
  1274. wisent/examples/scripts/results/test_eus_trivia_pairs.json +8 -0
  1275. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +51 -0
  1276. wisent/examples/scripts/results/test_evalita-mp_pairs.json +14 -0
  1277. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
  1278. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
  1279. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +51 -0
  1280. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +14 -0
  1281. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +51 -0
  1282. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +14 -0
  1283. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +30 -0
  1284. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +8 -0
  1285. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +51 -0
  1286. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +14 -0
  1287. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
  1288. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
  1289. wisent/examples/scripts/results/test_fda_evaluation.json +30 -0
  1290. wisent/examples/scripts/results/test_fda_pairs.json +8 -0
  1291. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +30 -0
  1292. wisent/examples/scripts/results/test_financial_tweets_pairs.json +8 -0
  1293. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +30 -0
  1294. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +8 -0
  1295. wisent/examples/scripts/results/test_fld_evaluation.json +30 -0
  1296. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +30 -0
  1297. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +8 -0
  1298. wisent/examples/scripts/results/test_fld_pairs.json +8 -0
  1299. wisent/examples/scripts/results/test_flores_evaluation.json +51 -0
  1300. wisent/examples/scripts/results/test_flores_pairs.json +14 -0
  1301. wisent/examples/scripts/results/test_freebase_evaluation.json +30 -0
  1302. wisent/examples/scripts/results/test_freebase_pairs.json +8 -0
  1303. wisent/examples/scripts/results/test_french_bench_evaluation.json +51 -0
  1304. wisent/examples/scripts/results/test_french_bench_pairs.json +14 -0
  1305. wisent/examples/scripts/results/test_galcola_evaluation.json +30 -0
  1306. wisent/examples/scripts/results/test_galcola_pairs.json +8 -0
  1307. wisent/examples/scripts/results/test_galician_bench_evaluation.json +51 -0
  1308. wisent/examples/scripts/results/test_galician_bench_pairs.json +14 -0
  1309. wisent/examples/scripts/results/test_glianorex_evaluation.json +30 -0
  1310. wisent/examples/scripts/results/test_glianorex_pairs.json +8 -0
  1311. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +51 -0
  1312. wisent/examples/scripts/results/test_global_mmlu_pairs.json +14 -0
  1313. wisent/examples/scripts/results/test_glue_evaluation.json +51 -0
  1314. wisent/examples/scripts/results/test_glue_pairs.json +14 -0
  1315. wisent/examples/scripts/results/test_gpqa_evaluation.json +51 -0
  1316. wisent/examples/scripts/results/test_gpqa_pairs.json +14 -0
  1317. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +51 -0
  1318. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +14 -0
  1319. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +30 -0
  1320. wisent/examples/scripts/results/test_groundcocoa_pairs.json +8 -0
  1321. wisent/examples/scripts/results/test_gsm8k_evaluation.json +30 -0
  1322. wisent/examples/scripts/results/test_gsm8k_pairs.json +8 -0
  1323. wisent/examples/scripts/results/test_haerae_evaluation.json +51 -0
  1324. wisent/examples/scripts/results/test_haerae_pairs.json +14 -0
  1325. wisent/examples/scripts/results/test_headqa_evaluation.json +30 -0
  1326. wisent/examples/scripts/results/test_headqa_pairs.json +8 -0
  1327. wisent/examples/scripts/results/test_hellaswag_evaluation.json +30 -0
  1328. wisent/examples/scripts/results/test_hellaswag_pairs.json +8 -0
  1329. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +51 -0
  1330. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +14 -0
  1331. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +51 -0
  1332. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +14 -0
  1333. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +30 -0
  1334. wisent/examples/scripts/results/test_histoires_morales_pairs.json +8 -0
  1335. wisent/examples/scripts/results/test_hmmt_evaluation.json +30 -0
  1336. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +30 -0
  1337. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +8 -0
  1338. wisent/examples/scripts/results/test_hmmt_pairs.json +8 -0
  1339. wisent/examples/scripts/results/test_hrm8k_evaluation.json +51 -0
  1340. wisent/examples/scripts/results/test_hrm8k_pairs.json +14 -0
  1341. wisent/examples/scripts/results/test_humaneval_evaluation.json +30 -0
  1342. wisent/examples/scripts/results/test_humaneval_pairs.json +8 -0
  1343. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +30 -0
  1344. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +8 -0
  1345. wisent/examples/scripts/results/test_ifeval_evaluation.json +30 -0
  1346. wisent/examples/scripts/results/test_ifeval_pairs.json +8 -0
  1347. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +30 -0
  1348. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +8 -0
  1349. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +30 -0
  1350. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +8 -0
  1351. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +51 -0
  1352. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +30 -0
  1353. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +8 -0
  1354. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +51 -0
  1355. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +14 -0
  1356. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +14 -0
  1357. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +30 -0
  1358. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +8 -0
  1359. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +30 -0
  1360. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +8 -0
  1361. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +30 -0
  1362. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +8 -0
  1363. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +30 -0
  1364. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +8 -0
  1365. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +30 -0
  1366. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +8 -0
  1367. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +51 -0
  1368. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +14 -0
  1369. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +30 -0
  1370. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +8 -0
  1371. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +30 -0
  1372. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +30 -0
  1373. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +8 -0
  1374. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +8 -0
  1375. wisent/examples/scripts/results/test_kbl_evaluation.json +51 -0
  1376. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +51 -0
  1377. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +14 -0
  1378. wisent/examples/scripts/results/test_kbl_pairs.json +14 -0
  1379. wisent/examples/scripts/results/test_kmmlu_evaluation.json +51 -0
  1380. wisent/examples/scripts/results/test_kmmlu_pairs.json +14 -0
  1381. wisent/examples/scripts/results/test_kobest_evaluation.json +51 -0
  1382. wisent/examples/scripts/results/test_kobest_pairs.json +14 -0
  1383. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +30 -0
  1384. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +8 -0
  1385. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +30 -0
  1386. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +8 -0
  1387. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +30 -0
  1388. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +8 -0
  1389. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +30 -0
  1390. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +8 -0
  1391. wisent/examples/scripts/results/test_lambada_evaluation.json +30 -0
  1392. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1393. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1394. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +51 -0
  1395. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +14 -0
  1396. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +51 -0
  1397. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +14 -0
  1398. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +51 -0
  1399. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +14 -0
  1400. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +30 -0
  1401. wisent/examples/scripts/results/test_lambada_openai_pairs.json +8 -0
  1402. wisent/examples/scripts/results/test_lambada_pairs.json +8 -0
  1403. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1404. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1405. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1406. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1407. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +30 -0
  1408. wisent/examples/scripts/results/test_lambada_standard_pairs.json +8 -0
  1409. wisent/examples/scripts/results/test_leaderboard_evaluation.json +51 -0
  1410. wisent/examples/scripts/results/test_leaderboard_pairs.json +14 -0
  1411. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +51 -0
  1412. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +14 -0
  1413. wisent/examples/scripts/results/test_libra_evaluation.json +51 -0
  1414. wisent/examples/scripts/results/test_libra_pairs.json +14 -0
  1415. wisent/examples/scripts/results/test_lingoly_evaluation.json +30 -0
  1416. wisent/examples/scripts/results/test_lingoly_pairs.json +8 -0
  1417. wisent/examples/scripts/results/test_livecodebench_evaluation.json +30 -0
  1418. wisent/examples/scripts/results/test_livecodebench_pairs.json +8 -0
  1419. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +30 -0
  1420. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +8 -0
  1421. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +30 -0
  1422. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +8 -0
  1423. wisent/examples/scripts/results/test_llama_evaluation.json +30 -0
  1424. wisent/examples/scripts/results/test_llama_pairs.json +8 -0
  1425. wisent/examples/scripts/results/test_logiqa2_evaluation.json +30 -0
  1426. wisent/examples/scripts/results/test_logiqa2_pairs.json +8 -0
  1427. wisent/examples/scripts/results/test_logiqa_evaluation.json +30 -0
  1428. wisent/examples/scripts/results/test_logiqa_pairs.json +8 -0
  1429. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +51 -0
  1430. wisent/examples/scripts/results/test_m_mmlu_pairs.json +14 -0
  1431. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +51 -0
  1432. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +14 -0
  1433. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +30 -0
  1434. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +8 -0
  1435. wisent/examples/scripts/results/test_mastermind_evaluation.json +51 -0
  1436. wisent/examples/scripts/results/test_mastermind_pairs.json +14 -0
  1437. wisent/examples/scripts/results/test_math500_evaluation.json +30 -0
  1438. wisent/examples/scripts/results/test_math500_pairs.json +8 -0
  1439. wisent/examples/scripts/results/test_math_evaluation.json +30 -0
  1440. wisent/examples/scripts/results/test_math_pairs.json +8 -0
  1441. wisent/examples/scripts/results/test_mathqa_evaluation.json +30 -0
  1442. wisent/examples/scripts/results/test_mathqa_pairs.json +8 -0
  1443. wisent/examples/scripts/results/test_mbpp_evaluation.json +30 -0
  1444. wisent/examples/scripts/results/test_mbpp_pairs.json +8 -0
  1445. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +30 -0
  1446. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +8 -0
  1447. wisent/examples/scripts/results/test_mc_taco_evaluation.json +30 -0
  1448. wisent/examples/scripts/results/test_mc_taco_pairs.json +8 -0
  1449. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +51 -0
  1450. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +14 -0
  1451. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +30 -0
  1452. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +8 -0
  1453. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +51 -0
  1454. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +14 -0
  1455. wisent/examples/scripts/results/test_meddialog_evaluation.json +30 -0
  1456. wisent/examples/scripts/results/test_meddialog_pairs.json +8 -0
  1457. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +30 -0
  1458. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +8 -0
  1459. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +30 -0
  1460. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +8 -0
  1461. wisent/examples/scripts/results/test_medmcqa_evaluation.json +30 -0
  1462. wisent/examples/scripts/results/test_medmcqa_pairs.json +8 -0
  1463. wisent/examples/scripts/results/test_medqa_evaluation.json +30 -0
  1464. wisent/examples/scripts/results/test_medqa_pairs.json +8 -0
  1465. wisent/examples/scripts/results/test_medtext_evaluation.json +30 -0
  1466. wisent/examples/scripts/results/test_medtext_pairs.json +8 -0
  1467. wisent/examples/scripts/results/test_mela_evaluation.json +51 -0
  1468. wisent/examples/scripts/results/test_mela_pairs.json +14 -0
  1469. wisent/examples/scripts/results/test_meqsum_evaluation.json +30 -0
  1470. wisent/examples/scripts/results/test_meqsum_pairs.json +8 -0
  1471. wisent/examples/scripts/results/test_mercury_evaluation.json +30 -0
  1472. wisent/examples/scripts/results/test_mercury_pairs.json +8 -0
  1473. wisent/examples/scripts/results/test_metabench_evaluation.json +51 -0
  1474. wisent/examples/scripts/results/test_metabench_pairs.json +14 -0
  1475. wisent/examples/scripts/results/test_mgsm_evaluation.json +51 -0
  1476. wisent/examples/scripts/results/test_mgsm_pairs.json +14 -0
  1477. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +30 -0
  1478. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +8 -0
  1479. wisent/examples/scripts/results/test_minerva_math_evaluation.json +51 -0
  1480. wisent/examples/scripts/results/test_minerva_math_pairs.json +14 -0
  1481. wisent/examples/scripts/results/test_mlqa_evaluation.json +51 -0
  1482. wisent/examples/scripts/results/test_mlqa_pairs.json +14 -0
  1483. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +51 -0
  1484. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +14 -0
  1485. wisent/examples/scripts/results/test_mmlu_evaluation.json +51 -0
  1486. wisent/examples/scripts/results/test_mmlu_pairs.json +14 -0
  1487. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +51 -0
  1488. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +14 -0
  1489. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +51 -0
  1490. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +14 -0
  1491. wisent/examples/scripts/results/test_mmlusr_evaluation.json +30 -0
  1492. wisent/examples/scripts/results/test_mmlusr_pairs.json +8 -0
  1493. wisent/examples/scripts/results/test_mmmu_evaluation.json +51 -0
  1494. wisent/examples/scripts/results/test_mmmu_pairs.json +14 -0
  1495. wisent/examples/scripts/results/test_mnli_evaluation.json +30 -0
  1496. wisent/examples/scripts/results/test_mnli_pairs.json +8 -0
  1497. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +51 -0
  1498. wisent/examples/scripts/results/test_model_written_evals_pairs.json +14 -0
  1499. wisent/examples/scripts/results/test_moral_stories_evaluation.json +30 -0
  1500. wisent/examples/scripts/results/test_moral_stories_pairs.json +8 -0
  1501. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +30 -0
  1502. wisent/examples/scripts/results/test_mts_dialog_pairs.json +8 -0
  1503. wisent/examples/scripts/results/test_multiblimp_evaluation.json +51 -0
  1504. wisent/examples/scripts/results/test_multiblimp_pairs.json +14 -0
  1505. wisent/examples/scripts/results/test_multimedqa_evaluation.json +51 -0
  1506. wisent/examples/scripts/results/test_multimedqa_pairs.json +14 -0
  1507. wisent/examples/scripts/results/test_multipl_e_evaluation.json +30 -0
  1508. wisent/examples/scripts/results/test_multipl_e_pairs.json +8 -0
  1509. wisent/examples/scripts/results/test_mutual_evaluation.json +30 -0
  1510. wisent/examples/scripts/results/test_mutual_pairs.json +8 -0
  1511. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1512. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +8 -0
  1513. wisent/examples/scripts/results/test_noreval_evaluation.json +51 -0
  1514. wisent/examples/scripts/results/test_noreval_pairs.json +14 -0
  1515. wisent/examples/scripts/results/test_noticia_evaluation.json +30 -0
  1516. wisent/examples/scripts/results/test_noticia_pairs.json +8 -0
  1517. wisent/examples/scripts/results/test_nq_open_evaluation.json +30 -0
  1518. wisent/examples/scripts/results/test_nq_open_pairs.json +8 -0
  1519. wisent/examples/scripts/results/test_olaph_evaluation.json +30 -0
  1520. wisent/examples/scripts/results/test_olaph_pairs.json +8 -0
  1521. wisent/examples/scripts/results/test_openbookqa_evaluation.json +30 -0
  1522. wisent/examples/scripts/results/test_openbookqa_pairs.json +8 -0
  1523. wisent/examples/scripts/results/test_openllm_evaluation.json +51 -0
  1524. wisent/examples/scripts/results/test_openllm_pairs.json +14 -0
  1525. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1526. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +8 -0
  1527. wisent/examples/scripts/results/test_paloma_evaluation.json +51 -0
  1528. wisent/examples/scripts/results/test_paloma_pairs.json +14 -0
  1529. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +30 -0
  1530. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +8 -0
  1531. wisent/examples/scripts/results/test_paws-x_evaluation.json +51 -0
  1532. wisent/examples/scripts/results/test_paws-x_pairs.json +14 -0
  1533. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +30 -0
  1534. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +8 -0
  1535. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +30 -0
  1536. wisent/examples/scripts/results/test_penn_treebank_pairs.json +8 -0
  1537. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +30 -0
  1538. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +8 -0
  1539. wisent/examples/scripts/results/test_piqa_evaluation.json +30 -0
  1540. wisent/examples/scripts/results/test_piqa_pairs.json +8 -0
  1541. wisent/examples/scripts/results/test_polemo2_evaluation.json +30 -0
  1542. wisent/examples/scripts/results/test_polemo2_pairs.json +8 -0
  1543. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +30 -0
  1544. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +8 -0
  1545. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +30 -0
  1546. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +8 -0
  1547. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +30 -0
  1548. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +8 -0
  1549. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +30 -0
  1550. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +8 -0
  1551. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +51 -0
  1552. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +14 -0
  1553. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1554. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
  1555. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1556. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
  1557. wisent/examples/scripts/results/test_prost_evaluation.json +30 -0
  1558. wisent/examples/scripts/results/test_prost_pairs.json +8 -0
  1559. wisent/examples/scripts/results/test_ptb_evaluation.json +30 -0
  1560. wisent/examples/scripts/results/test_ptb_pairs.json +8 -0
  1561. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +30 -0
  1562. wisent/examples/scripts/results/test_pubmedqa_pairs.json +8 -0
  1563. wisent/examples/scripts/results/test_pythia_evaluation.json +51 -0
  1564. wisent/examples/scripts/results/test_pythia_pairs.json +14 -0
  1565. wisent/examples/scripts/results/test_qa4mre_evaluation.json +30 -0
  1566. wisent/examples/scripts/results/test_qa4mre_pairs.json +8 -0
  1567. wisent/examples/scripts/results/test_qasper_evaluation.json +30 -0
  1568. wisent/examples/scripts/results/test_qasper_pairs.json +8 -0
  1569. wisent/examples/scripts/results/test_race_evaluation.json +30 -0
  1570. wisent/examples/scripts/results/test_race_pairs.json +8 -0
  1571. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +30 -0
  1572. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +8 -0
  1573. wisent/examples/scripts/results/test_recode_evaluation.json +30 -0
  1574. wisent/examples/scripts/results/test_recode_pairs.json +8 -0
  1575. wisent/examples/scripts/results/test_record_evaluation.json +30 -0
  1576. wisent/examples/scripts/results/test_record_pairs.json +8 -0
  1577. wisent/examples/scripts/results/test_ruler_evaluation.json +51 -0
  1578. wisent/examples/scripts/results/test_ruler_pairs.json +14 -0
  1579. wisent/examples/scripts/results/test_sciq_evaluation.json +30 -0
  1580. wisent/examples/scripts/results/test_sciq_pairs.json +8 -0
  1581. wisent/examples/scripts/results/test_score_evaluation.json +51 -0
  1582. wisent/examples/scripts/results/test_score_pairs.json +14 -0
  1583. wisent/examples/scripts/results/test_self_consistency_evaluation.json +30 -0
  1584. wisent/examples/scripts/results/test_self_consistency_pairs.json +8 -0
  1585. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +30 -0
  1586. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +8 -0
  1587. wisent/examples/scripts/results/test_siqa_evaluation.json +30 -0
  1588. wisent/examples/scripts/results/test_siqa_pairs.json +8 -0
  1589. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +51 -0
  1590. wisent/examples/scripts/results/test_spanish_bench_pairs.json +14 -0
  1591. wisent/examples/scripts/results/test_squad2_evaluation.json +30 -0
  1592. wisent/examples/scripts/results/test_squad2_pairs.json +8 -0
  1593. wisent/examples/scripts/results/test_squadv2_evaluation.json +30 -0
  1594. wisent/examples/scripts/results/test_squadv2_pairs.json +8 -0
  1595. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +30 -0
  1596. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +8 -0
  1597. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +51 -0
  1598. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +14 -0
  1599. wisent/examples/scripts/results/test_swag_evaluation.json +30 -0
  1600. wisent/examples/scripts/results/test_swag_pairs.json +8 -0
  1601. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +51 -0
  1602. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +14 -0
  1603. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +51 -0
  1604. wisent/examples/scripts/results/test_tmmluplus_pairs.json +14 -0
  1605. wisent/examples/scripts/results/test_translation_evaluation.json +51 -0
  1606. wisent/examples/scripts/results/test_translation_pairs.json +14 -0
  1607. wisent/examples/scripts/results/test_triviaqa_evaluation.json +30 -0
  1608. wisent/examples/scripts/results/test_triviaqa_pairs.json +8 -0
  1609. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +51 -0
  1610. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +14 -0
  1611. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +30 -0
  1612. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +30 -0
  1613. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +8 -0
  1614. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +30 -0
  1615. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +8 -0
  1616. wisent/examples/scripts/results/test_truthfulqa_pairs.json +8 -0
  1617. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +51 -0
  1618. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +14 -0
  1619. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +30 -0
  1620. wisent/examples/scripts/results/test_unfair_tos_pairs.json +8 -0
  1621. wisent/examples/scripts/results/test_unscramble_evaluation.json +51 -0
  1622. wisent/examples/scripts/results/test_unscramble_pairs.json +14 -0
  1623. wisent/examples/scripts/results/test_webqs_evaluation.json +30 -0
  1624. wisent/examples/scripts/results/test_webqs_pairs.json +8 -0
  1625. wisent/examples/scripts/results/test_wikitext103_evaluation.json +30 -0
  1626. wisent/examples/scripts/results/test_wikitext103_pairs.json +8 -0
  1627. wisent/examples/scripts/results/test_wikitext_evaluation.json +30 -0
  1628. wisent/examples/scripts/results/test_wikitext_pairs.json +8 -0
  1629. wisent/examples/scripts/results/test_winogender_evaluation.json +51 -0
  1630. wisent/examples/scripts/results/test_winogender_pairs.json +14 -0
  1631. wisent/examples/scripts/results/test_winogrande_evaluation.json +30 -0
  1632. wisent/examples/scripts/results/test_winogrande_pairs.json +8 -0
  1633. wisent/examples/scripts/results/test_wmdp_evaluation.json +30 -0
  1634. wisent/examples/scripts/results/test_wmdp_pairs.json +8 -0
  1635. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +30 -0
  1636. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +8 -0
  1637. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +30 -0
  1638. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +8 -0
  1639. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +30 -0
  1640. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +8 -0
  1641. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +30 -0
  1642. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +8 -0
  1643. wisent/examples/scripts/results/test_wsc273_evaluation.json +30 -0
  1644. wisent/examples/scripts/results/test_wsc273_pairs.json +8 -0
  1645. wisent/examples/scripts/results/test_xcopa_evaluation.json +51 -0
  1646. wisent/examples/scripts/results/test_xcopa_pairs.json +14 -0
  1647. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +30 -0
  1648. wisent/examples/scripts/results/test_xnli_eu_pairs.json +8 -0
  1649. wisent/examples/scripts/results/test_xnli_evaluation.json +51 -0
  1650. wisent/examples/scripts/results/test_xnli_pairs.json +14 -0
  1651. wisent/examples/scripts/results/test_xquad_evaluation.json +51 -0
  1652. wisent/examples/scripts/results/test_xquad_pairs.json +14 -0
  1653. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +51 -0
  1654. wisent/examples/scripts/results/test_xstorycloze_pairs.json +14 -0
  1655. wisent/examples/scripts/results/test_xsum_evaluation.json +30 -0
  1656. wisent/examples/scripts/results/test_xsum_pairs.json +8 -0
  1657. wisent/examples/scripts/results/test_xwinograd_evaluation.json +51 -0
  1658. wisent/examples/scripts/results/test_xwinograd_pairs.json +14 -0
  1659. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +30 -0
  1660. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +8 -0
  1661. wisent/parameters/__init__.py +1 -0
  1662. wisent/parameters/lm_eval/all_lm_eval_task_families.json +169 -0
  1663. wisent/parameters/lm_eval/broken_in_lm_eval.json +10 -0
  1664. wisent/parameters/lm_eval/evaluations_not_lm_eval_tasks.json +0 -0
  1665. wisent/parameters/lm_eval/evaluator_check.json +3476 -0
  1666. wisent/parameters/lm_eval/final_verification.json +24782 -0
  1667. wisent/parameters/lm_eval/group_task_evaluators.json +1833 -0
  1668. wisent/parameters/lm_eval/group_tasks.json +150 -0
  1669. wisent/parameters/lm_eval/individual_tasks.json +402 -0
  1670. wisent/parameters/lm_eval/no_readmes.json +1 -0
  1671. wisent/parameters/lm_eval/not_lm_eval_tasks.json +110 -0
  1672. wisent/parameters/lm_eval/read_tasks.json +208 -0
  1673. wisent/parameters/lm_eval/readme_files.json +208 -0
  1674. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +128 -0
  1675. wisent/parameters/tasks/missing_task_families.json +2963 -0
  1676. wisent/parameters/tasks/remaining_tasks_to_implement.json +199 -0
  1677. wisent/parameters/tasks/risks.json +10 -0
  1678. wisent/parameters/tasks/skills.json +14 -0
  1679. wisent/parameters/tasks/tasks.json +56031 -0
  1680. wisent/scripts/run_quality_metrics_sweep.sh +315 -0
  1681. wisent/tests/__init__.py +0 -0
  1682. wisent/tests/examples/__init__.py +0 -0
  1683. wisent/tests/examples/cli/__init__.py +0 -0
  1684. wisent/tests/examples/cli/activations/__init__.py +0 -0
  1685. wisent/tests/examples/cli/activations/test_get_activations.py +127 -0
  1686. wisent/tests/examples/cli/classifier/__init__.py +0 -0
  1687. wisent/tests/examples/cli/classifier/test_classifier_examples.py +141 -0
  1688. wisent/tests/examples/cli/contrastive_pairs/__init__.py +0 -0
  1689. wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py +89 -0
  1690. wisent/tests/examples/cli/evaluation/__init__.py +0 -0
  1691. wisent/tests/examples/cli/evaluation/test_evaluation_examples.py +117 -0
  1692. wisent/tests/examples/cli/generate/__init__.py +0 -0
  1693. wisent/tests/examples/cli/generate/test_generate_with_classifier.py +146 -0
  1694. wisent/tests/examples/cli/generate/test_generate_with_steering.py +149 -0
  1695. wisent/tests/examples/cli/generate/test_only_generate.py +110 -0
  1696. wisent/tests/examples/cli/multi_steering/__init__.py +0 -0
  1697. wisent/tests/examples/cli/multi_steering/test_multi_steer_from_trained_vectors.py +210 -0
  1698. wisent/tests/examples/cli/multi_steering/test_multi_steer_with_different_parameters.py +205 -0
  1699. wisent/tests/examples/cli/multi_steering/test_train_and_multi_steer.py +174 -0
  1700. wisent/tests/examples/cli/optimizer/__init__.py +0 -0
  1701. wisent/tests/examples/cli/optimizer/test_optimize_sample_size.py +102 -0
  1702. wisent/tests/examples/cli/optimizer/test_optimizer_examples.py +59 -0
  1703. wisent/tests/examples/cli/steering/__init__.py +0 -0
  1704. wisent/tests/examples/cli/steering/test_create_steering_vectors.py +135 -0
  1705. wisent/tests/examples/cli/synthetic/__init__.py +0 -0
  1706. wisent/tests/examples/cli/synthetic/test_synthetic_pairs.py +45 -0
  1707. wisent/tests/nosense/__init__.py +6 -0
  1708. wisent/tests/nosense/base_nosense.py +81 -0
  1709. wisent/tests/nosense/math500_nosense.py +72 -0
  1710. wisent/tests/nosense/test_robustness.py +336 -0
  1711. wisent/tests/test_all_cli_commands.py +674 -0
  1712. wisent/tests/test_geometry_comprehensive.py +327 -0
  1713. wisent/tests/test_titan_geometry.py +257 -0
  1714. wisent/tests/visualize_geometry.py +148 -0
  1715. wisent-0.7.379.dist-info/METADATA +64 -0
  1716. wisent-0.7.379.dist-info/RECORD +1720 -0
  1717. wisent-0.7.379.dist-info/WHEEL +5 -0
  1718. wisent-0.7.379.dist-info/entry_points.txt +2 -0
  1719. wisent-0.7.379.dist-info/licenses/LICENSE +21 -0
  1720. wisent-0.7.379.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1431 @@
1
+ """
2
+ LM-Eval-Harness Ground Truth Evaluation
3
+
4
+ This module provides ground truth evaluation using the lm-eval-harness framework.
5
+
6
+ Uses unified split strategy: all available splits are combined and split 80/20 into train/test.
7
+ Evaluation uses the TEST portion (20%) to ensure no data leakage with training.
8
+ """
9
+
10
+ import logging
11
+ from typing import Any, Dict
12
+
13
+ from wisent.core.activations.core.atoms import ActivationAggregationStrategy
14
+ from wisent.core.activations.activations import Activations
15
+ from wisent.core.layer import Layer
16
+ from wisent.core.utils.dataset_splits import get_all_docs_from_task, create_deterministic_split
17
+ from wisent.core.models.inference_config import get_generate_kwargs
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class LMEvalHarnessGroundTruth:
23
+ """
24
+ Ground truth evaluator using lm-eval-harness tasks.
25
+
26
+ This class orchestrates the evaluation of classifiers on lm-eval-harness tasks
27
+ by routing to appropriate evaluation methods based on the task type.
28
+ """
29
+
30
+ def __init__(self, task_name: str, evaluation_method: str = None, model=None):
31
+ """
32
+ Initialize the LM-eval-harness ground truth evaluator.
33
+
34
+ Args:
35
+ task_name: Name of the lm-eval task
36
+ evaluation_method: Evaluation method ("log-likelihoods", "text-generation", "perplexity")
37
+ model: The model instance for activation extraction
38
+ """
39
+ self.task_name = task_name
40
+ self.evaluation_method = evaluation_method
41
+ self.model = model
42
+
43
+ # Load evaluation method from benchmark configuration if not provided
44
+ if not self.evaluation_method:
45
+ self.evaluation_method = self._get_evaluation_method_for_task(task_name)
46
+
47
def evaluate_classifier_on_task(
    self,
    classifier,
    task_name: str,
    num_samples: int = 100,
    model=None,
    layer: int = 15,
    token_aggregation: str = "average",
) -> Dict[str, Any]:
    """
    Evaluate a classifier on the specified lm-eval task.

    The call is routed on ``self.evaluation_method`` to one of the private
    ``_evaluate_*`` methods; all of them receive the same positional arguments.

    Args:
        classifier: The classifier to evaluate.
        task_name: Name of the lm-eval task.
        num_samples: Number of samples to evaluate.
        model: The model instance. Overrides ``self.model`` whenever it is
            not ``None``, even if the object itself evaluates as falsy.
        layer: Layer to extract activations from.
        token_aggregation: Token aggregation method
            ("average", "final", "first", "max", "min").

    Returns:
        The dict produced by the selected evaluation method, or an
        "UNKNOWN" / "lm-eval-harness-unsupported" dict when
        ``self.evaluation_method`` matches none of the supported methods.
    """
    # Explicit None check: `model or self.model` would discard a caller-supplied
    # model whose truth value is False (e.g. a wrapper defining __len__/__bool__).
    evaluation_model = model if model is not None else self.model

    # (evaluation method, name of the handling method). Handlers are resolved
    # lazily with getattr so only the selected one is ever looked up.
    routes = (
        ("log-likelihoods", "_evaluate_log_likelihoods"),
        ("text-generation", "_evaluate_text_generation"),
        ("perplexity", "_evaluate_perplexity"),
        ("code-execution", "_evaluate_code_execution"),
    )
    for method, handler_name in routes:
        if self.evaluation_method == method:
            handler = getattr(self, handler_name)
            return handler(classifier, task_name, num_samples, evaluation_model, layer, token_aggregation)

    return {
        "ground_truth": "UNKNOWN",
        "method_used": "lm-eval-harness-unsupported",
        "confidence": 0.0,
        "details": f"Unsupported evaluation method: {self.evaluation_method}",
        "task_name": task_name,
        "evaluation_method": self.evaluation_method,
    }
99
+
100
def _evaluate_log_likelihoods(
    self, classifier, task_name: str, num_samples: int, model, layer: int, token_aggregation: str = "average"
) -> Dict[str, Any]:
    """
    Evaluate the classifier with the log-likelihoods approach.

    Delegates to ``LogLikelihoodsEvaluator.evaluate_classifier_on_task`` and
    returns its result unchanged.

    Args:
        classifier: The classifier to evaluate.
        task_name: Name of the lm-eval task.
        num_samples: Number of samples to evaluate.
        model: Model instance handed to the evaluator.
        layer: Layer to extract activations from.
        token_aggregation: Token aggregation method name.

    Returns:
        The evaluator's result dict, or an "UNKNOWN" / "lm-eval-harness-error"
        dict if anything raised (including the evaluator import).
    """
    try:
        # Imported inside the try so that an import failure is reported through
        # the same error-result path as an evaluation failure.
        from .log_likelihoods_evaluator import LogLikelihoodsEvaluator

        evaluator = LogLikelihoodsEvaluator(task_name, model=model)
        results = evaluator.evaluate_classifier_on_task(
            classifier,
            task_name,
            num_samples=num_samples,
            model=model,
            layer=layer,
            token_aggregation=token_aggregation,
        )

        # Was an unconditional print(); route through the module logger instead
        # so library users control whether results are echoed.
        logger.debug(f"Log-likelihoods evaluation results for {task_name}: {results}")

        return results

    except Exception as e:
        # exc_info keeps the traceback, which the plain message would lose.
        logger.error(f"Error in log-likelihoods evaluation: {e}", exc_info=True)
        return {
            "ground_truth": "UNKNOWN",
            "method_used": "lm-eval-harness-error",
            "confidence": 0.0,
            "details": f"Log-likelihoods evaluation failed: {e!s}",
            "task_name": task_name,
            "evaluation_method": "log-likelihoods",
        }
135
+
136
def _evaluate_text_generation(
    self, classifier, task_name: str, num_samples: int, model, layer: int, token_aggregation: str = "average"
) -> Dict[str, Any]:
    """
    Evaluate the classifier with the text-generation approach.

    Steps:
        1. Load documents, either through the TaskInterface loader or through
           ``model.load_lm_eval_task`` / ``model.split_task_data``.
        2. Generate one response per document and pair it with its ground truth.
        3. Score the generated responses with ``_evaluate_with_lm_eval_metrics``.
        4. Run the classifier on activations extracted from each generated
           response; a score below 0.5 is recorded as "truthful".

    Args:
        classifier: Object exposing ``predict_proba`` and/or ``predict``.
        task_name: Name of the task to evaluate.
        num_samples: Maximum number of documents to load.
        model: Model used for task loading, generation and activation extraction.
        layer: Layer index used for generation and activation extraction.
        token_aggregation: Token aggregation method name, mapped through
            ``_map_token_aggregation_to_activation_method``.

    Returns:
        A result dict with "lm_eval_metrics", "classification_results" and
        "total_samples", or the value of ``self._error_result(...)`` when no
        documents are available or an unexpected error occurs.
    """
    try:
        logger.info(f"🎯 TEXT GENERATION EVALUATION: {task_name}")

        # TODO In general LMEvalHarness should be rebuild to be BenchmarkGroundTruth
        # Check if this is a TaskInterface task
        if self._is_task_interface_task(task_name):
            logger.info(f"Loading documents for {task_name} from TaskInterface")
            docs, task_data = self._load_task_interface_data(task_name, num_samples)
        else:
            logger.info(f"Loading documents for {task_name} from lm-eval-harness")
            # Use existing lm-eval task loading infrastructure
            task_data = model.load_lm_eval_task(task_name, shots=0, limit=num_samples)
            docs, _ = model.split_task_data(task_data, split_ratio=1.0)  # Use all for evaluation

        if not docs:
            return self._error_result(f"No documents retrieved from task: {task_name}")

        logger.info(f"📝 Retrieved {len(docs)} documents from {task_name}")

        # Generate responses using the model
        generated_responses = []

        for i, doc in enumerate(docs):
            try:
                # Extract question from document
                if hasattr(task_data, "doc_to_text"):
                    question = task_data.doc_to_text(doc)
                else:
                    question = str(doc.get("question", doc.get("text", "")))

                # Generate response using model (use low temperature for deterministic evaluation)
                logger.debug(f"🔸 Generating response for: {question}...")
                gen_kwargs = get_generate_kwargs(max_new_tokens=150, temperature=0.1, do_sample=False)
                generated_response, _ = model.generate(
                    prompt=question, layer_index=layer, **gen_kwargs
                )

                # Extract ground truth answer
                # HLE / math task handling: use the raw "answer" field
                if task_name.startswith("hle") or task_name in ["math500", "math", "hendrycks_math"]:
                    ground_truth = doc.get("answer", "")
                # AIME task handling: the field may be capitalised
                elif task_name.startswith("aime"):
                    ground_truth = str(doc.get("Answer", "") or doc.get("answer", ""))
                # For DROP, use raw document data to preserve the structured format
                elif task_name == "drop":
                    ground_truth = doc.get("answer", {})
                elif hasattr(task_data, "doc_to_target"):
                    ground_truth = task_data.doc_to_target(doc)
                else:
                    ground_truth = str(doc.get("answer", doc.get("target", "")))

                generated_responses.append(
                    {
                        "question": question,
                        "generated_response": generated_response,
                        "ground_truth": ground_truth,
                        "doc": doc,
                    }
                )

                logger.debug(f" 📝 Generated: {generated_response}...")
                # ground_truth may be an int, dict or string; stringify for logging
                gt_str = str(ground_truth)
                logger.debug(f" ✅ Ground truth: {gt_str}...")

            except Exception as e:
                # A failing document is skipped; the remaining ones are still evaluated.
                logger.error(f"Error generating response for doc {i}: {e}")
                continue

        # Evaluate using lm-eval-harness metrics
        logger.info(f"🎯 Evaluating {len(generated_responses)} generated responses using lm-eval metrics...")

        # Use lm-eval-harness's actual evaluation for this task
        evaluation_results = self._evaluate_with_lm_eval_metrics(task_name, generated_responses, task_data)

        # Now classify the generated responses to see if classifier agrees
        classification_results = []
        for response_data in generated_responses:
            try:
                layer_obj = Layer(index=layer, type="transformer")

                # Extract activations from generated response
                activation_tensor = model.extract_activations(response_data["generated_response"], layer_obj)
                activation_method = self._map_token_aggregation_to_activation_method(token_aggregation)

                activation_obj = Activations(
                    tensor=activation_tensor, layer=layer_obj, aggregation_strategy=activation_method
                )

                # Get classifier prediction
                features = activation_obj.extract_features_for_classifier()

                # Handle different classifier return formats
                try:
                    # Try predict_proba first (returns probabilities)
                    features_numpy = features.cpu().numpy()
                    logger.debug(f"🔧 Features shape: {features_numpy.shape}")

                    prediction_proba = classifier.predict_proba([features_numpy])
                    logger.debug(f"🔧 predict_proba returned: {prediction_proba} (type: {type(prediction_proba)})")

                    # Unwrap the outer list/tuple (one entry per input sample)
                    if isinstance(prediction_proba, (list, tuple)):
                        if len(prediction_proba) > 0:
                            prediction = prediction_proba[0]
                            logger.debug(f"🔧 Extracted from list/tuple: {prediction} (type: {type(prediction)})")
                        else:
                            prediction = 0.5  # Default neutral score
                    else:
                        prediction = prediction_proba

                    # If prediction is still array-like, take its first element
                    if hasattr(prediction, "__len__") and not isinstance(prediction, str):
                        if len(prediction) > 0:
                            prediction = prediction[0]
                            logger.debug(f"🔧 Extracted from array: {prediction} (type: {type(prediction)})")
                        else:
                            prediction = 0.5

                    # Ensure we have a float
                    prediction = float(prediction)
                    logger.debug(f"🔧 Final prediction: {prediction}")

                except Exception as proba_error:
                    logger.warning(f"predict_proba failed: {proba_error}, trying predict...")

                    # Fallback to predict method
                    try:
                        predictions = classifier.predict([features.cpu().numpy()])
                        if len(predictions) > 0:
                            prediction = float(predictions[0])
                        else:
                            logger.warning("Classifier predict returned empty array")
                            prediction = 0.5
                    except Exception as predict_error:
                        logger.error(f"Both predict_proba and predict failed: {predict_error}")
                        prediction = 0.5  # Default neutral score

                classification_results.append(
                    {
                        "question": response_data["question"],
                        "generated_response": response_data["generated_response"],
                        "classifier_score": prediction,
                        "classifier_truthful": prediction < 0.5,  # Lower = more truthful
                    }
                )

                logger.debug("📋 Generated Response Classification:")
                logger.debug(f" 🔸 Question: {response_data['question']}...")
                logger.debug(f" 🧠 Generated: {response_data['generated_response']}...")
                logger.debug(f" 🎯 Classifier score: {prediction:.3f} (lower = more truthful)")
                logger.debug(f" ✅ Classifier judgment: {'TRUTHFUL' if prediction < 0.5 else 'UNTRUTHFUL'}")

            except Exception as e:
                # exc_info attaches the traceback to the log record, replacing the
                # per-iteration `import traceback` + format_exc() call.
                logger.error(f"Error classifying generated response: {e}", exc_info=True)
                # Keep one entry per generated response even on failure, marked
                # with a neutral score and the error text.
                classification_results.append(
                    {
                        "question": response_data["question"],
                        "generated_response": response_data["generated_response"],
                        "classifier_score": 0.5,  # Default neutral score
                        "classifier_truthful": False,
                        "error": str(e),
                    }
                )
                continue

        return {
            "ground_truth": "EVALUATED",
            "method_used": "lm-eval-harness-text-generation",
            "confidence": evaluation_results.get("accuracy", 0.0),
            "details": f"Generated and evaluated {len(generated_responses)} responses using lm-eval metrics",
            "task_name": task_name,
            "evaluation_method": "text-generation",
            "lm_eval_metrics": evaluation_results,
            "classification_results": classification_results,
            "total_samples": len(generated_responses),
        }

    except Exception as e:
        logger.error(f"Error in text generation evaluation: {e}")
        return self._error_result(f"Text generation evaluation error: {e!s}")
331
+
332
+ def _evaluate_perplexity(
333
+ self, classifier, task_name: str, num_samples: int, model, layer: int, token_aggregation: str = "average"
334
+ ) -> Dict[str, Any]:
335
+ """Evaluate classifier using perplexity approach."""
336
+ try:
337
+ logger.info(f"🎯 PERPLEXITY EVALUATION: {task_name}")
338
+
339
+ # Use existing task loading infrastructure
340
+ task_data = model.load_lm_eval_task(task_name, shots=0, limit=num_samples)
341
+ docs, _ = model.split_task_data(task_data, split_ratio=1.0) # Use all for evaluation
342
+
343
+ if not docs:
344
+ return self._error_result(f"No documents retrieved from task: {task_name}")
345
+
346
+ logger.info(f"📝 Retrieved {len(docs)} documents from {task_name}")
347
+
348
+ # Calculate perplexity scores for different responses
349
+ perplexity_results = []
350
+
351
+ for i, doc in enumerate(docs):
352
+ try:
353
+ # For WikiText and other pure language modeling tasks
354
+ if task_name == "wikitext":
355
+ # Get the full text for perplexity calculation
356
+ text = doc.get("page", doc.get("text", ""))
357
+ if not text:
358
+ logger.warning(f"No text found in WikiText document {i}")
359
+ continue
360
+
361
+ logger.debug(f"🔸 Calculating perplexity for WikiText document {i} ({len(text)} chars)...")
362
+
363
+ # Calculate perplexity on the full text
364
+ perplexity = self._calculate_perplexity(model, text)
365
+
366
+ # Extract activations from the text for classifier
367
+ try:
368
+ layer_obj = Layer(index=layer, type="transformer")
369
+
370
+ # Use a truncated version for activation extraction if text is too long
371
+ activation_text = text[:1000] if len(text) > 1000 else text
372
+ activation_tensor = model.extract_activations(activation_text, layer_obj)
373
+ activation_method = self._map_token_aggregation_to_activation_method(token_aggregation)
374
+
375
+ activation_obj = Activations(
376
+ tensor=activation_tensor, layer=layer_obj, aggregation_strategy=activation_method
377
+ )
378
+
379
+ # Get classifier prediction (only if classifier is provided)
380
+ if classifier is not None:
381
+ features = activation_obj.extract_features_for_classifier()
382
+
383
+ # Handle different classifier return formats
384
+ try:
385
+ prediction_proba = classifier.predict_proba([features.cpu().numpy()])
386
+
387
+ if isinstance(prediction_proba, (list, tuple)) and len(prediction_proba) > 0:
388
+ classification_score = float(prediction_proba[0])
389
+ else:
390
+ classification_score = float(prediction_proba)
391
+
392
+ if hasattr(classification_score, "__len__") and not isinstance(
393
+ classification_score, str
394
+ ):
395
+ classification_score = float(classification_score[0])
396
+
397
+ except Exception as proba_error:
398
+ logger.warning(f"predict_proba failed: {proba_error}, trying predict...")
399
+ try:
400
+ predictions = classifier.predict([features.cpu().numpy()])
401
+ if len(predictions) > 0:
402
+ classification_score = float(predictions[0])
403
+ else:
404
+ logger.warning("Classifier predict returned empty array")
405
+ classification_score = 0.5
406
+ except Exception as predict_error:
407
+ logger.error(f"Both predict_proba and predict failed: {predict_error}")
408
+ classification_score = 0.5
409
+ else:
410
+ # No classifier provided - use default neutral score for perplexity-only evaluation
411
+ classification_score = 0.5
412
+
413
+ except Exception as e:
414
+ logger.error(f"Error classifying WikiText document: {e}")
415
+ classification_score = None
416
+
417
+ result = {
418
+ "document_idx": i,
419
+ "text_preview": text[:200] + "..." if len(text) > 200 else text,
420
+ "text_length": len(text),
421
+ "perplexity": perplexity,
422
+ "classifier_score": classification_score,
423
+ }
424
+
425
+ perplexity_results.append(result)
426
+
427
+ logger.debug("📋 WikiText Perplexity Analysis:")
428
+ logger.debug(f" 📊 Document {i}: {len(text)} chars")
429
+ logger.debug(f" 🎯 Perplexity: {perplexity:.3f}")
430
+ if classification_score is not None:
431
+ logger.debug(f" 🧠 Classifier score: {classification_score:.3f} (lower = more truthful)")
432
+
433
+ continue # Skip the rest of the loop for WikiText
434
+
435
+ # Extract question/prompt and possible completions for other tasks
436
+ if hasattr(task_data, "doc_to_text"):
437
+ prompt = task_data.doc_to_text(doc)
438
+ else:
439
+ prompt = str(doc.get("question", doc.get("text", "")))
440
+
441
+ # For multiple choice tasks, get all choices
442
+ choices = []
443
+ if hasattr(task_data, "doc_to_choice"):
444
+ choices = [
445
+ task_data.doc_to_choice(doc, choice_idx)
446
+ for choice_idx in range(len(doc.get("choices", [])))
447
+ ]
448
+ elif "choices" in doc:
449
+ choices = doc["choices"]
450
+ else:
451
+ # For non-multiple choice, generate a response and calculate its perplexity
452
+ gen_kwargs = get_generate_kwargs(max_new_tokens=100, temperature=0.1, do_sample=False)
453
+ generated_response, _ = model.generate(
454
+ prompt=prompt, layer_index=layer, **gen_kwargs
455
+ )
456
+ choices = [generated_response]
457
+
458
+ logger.debug(f"🔸 Calculating perplexity for: {prompt[:100]}...")
459
+
460
+ # Calculate perplexity for each choice
461
+ choice_perplexities = []
462
+ for choice_idx, choice in enumerate(choices):
463
+ try:
464
+ # Calculate perplexity of the choice given the prompt
465
+ full_text = f"{prompt} {choice}"
466
+ perplexity = self._calculate_perplexity(model, full_text)
467
+
468
+ choice_perplexities.append(
469
+ {"choice_idx": choice_idx, "choice_text": choice, "perplexity": perplexity}
470
+ )
471
+
472
+ logger.debug(f" 📊 Choice {choice_idx}: {choice[:50]}... (perplexity: {perplexity:.3f})")
473
+
474
+ except Exception as e:
475
+ logger.error(f"Error calculating perplexity for choice {choice_idx}: {e}")
476
+ continue
477
+
478
+ # Get ground truth answer index
479
+ ground_truth_idx = None
480
+ if hasattr(task_data, "doc_to_target"):
481
+ ground_truth = task_data.doc_to_target(doc)
482
+ try:
483
+ ground_truth_idx = int(ground_truth)
484
+ except:
485
+ ground_truth_idx = None
486
+ elif "answer" in doc:
487
+ ground_truth_idx = doc["answer"]
488
+
489
+ # Find the choice with lowest perplexity (most likely)
490
+ if choice_perplexities:
491
+ best_choice = min(choice_perplexities, key=lambda x: x["perplexity"])
492
+
493
+ # Classify the best choice using the classifier
494
+ classification_score = None
495
+ try:
496
+ layer_obj = Layer(index=layer, type="transformer")
497
+
498
+ # Extract activations from the best choice
499
+ activation_tensor = model.extract_activations(best_choice["choice_text"], layer_obj)
500
+ activation_method = self._map_token_aggregation_to_activation_method(token_aggregation)
501
+
502
+ activation_obj = Activations(
503
+ tensor=activation_tensor, layer=layer_obj, aggregation_strategy=activation_method
504
+ )
505
+
506
+ # Get classifier prediction
507
+ features = activation_obj.extract_features_for_classifier()
508
+
509
+ # Handle different classifier return formats
510
+ try:
511
+ # Try predict_proba first (returns probabilities)
512
+ prediction_proba = classifier.predict_proba([features.cpu().numpy()])
513
+
514
+ # Handle different return formats
515
+ if isinstance(prediction_proba, (list, tuple)):
516
+ if len(prediction_proba) > 0:
517
+ classification_score = prediction_proba[0]
518
+ else:
519
+ classification_score = 0.5 # Default neutral score
520
+ else:
521
+ classification_score = prediction_proba
522
+
523
+ # If prediction is array-like, get first element
524
+ if hasattr(classification_score, "__len__") and not isinstance(
525
+ classification_score, str
526
+ ):
527
+ if len(classification_score) > 0:
528
+ classification_score = classification_score[0]
529
+ else:
530
+ classification_score = 0.5
531
+
532
+ # Ensure we have a float
533
+ classification_score = float(classification_score)
534
+
535
+ except Exception as proba_error:
536
+ logger.warning(f"predict_proba failed: {proba_error}, trying predict...")
537
+
538
+ # Fallback to predict method
539
+ try:
540
+ predictions = classifier.predict([features.cpu().numpy()])
541
+ if len(predictions) > 0:
542
+ classification_score = float(predictions[0])
543
+ else:
544
+ logger.warning("Classifier predict returned empty array")
545
+ classification_score = 0.5
546
+ except Exception as predict_error:
547
+ logger.error(f"Both predict_proba and predict failed: {predict_error}")
548
+ classification_score = 0.5 # Default neutral score
549
+
550
+ except Exception as e:
551
+ logger.error(f"Error classifying best choice: {e}")
552
+
553
+ result = {
554
+ "question": prompt,
555
+ "choices": choice_perplexities,
556
+ "best_choice_idx": best_choice["choice_idx"],
557
+ "best_choice_text": best_choice["choice_text"],
558
+ "best_choice_perplexity": best_choice["perplexity"],
559
+ "ground_truth_idx": ground_truth_idx,
560
+ "classifier_score": classification_score,
561
+ "perplexity_correct": best_choice["choice_idx"] == ground_truth_idx
562
+ if ground_truth_idx is not None
563
+ else None,
564
+ }
565
+
566
+ perplexity_results.append(result)
567
+
568
+ logger.debug("📋 Perplexity Analysis:")
569
+ logger.debug(f" 🔸 Question: {prompt[:100]}...")
570
+ logger.debug(f" 📊 Best choice (lowest perplexity): {best_choice['choice_text'][:100]}...")
571
+ logger.debug(f" 🎯 Perplexity: {best_choice['perplexity']:.3f}")
572
+ logger.debug(
573
+ f" 🧠 Classifier score: {classification_score:.3f} (lower = more truthful)"
574
+ if classification_score is not None
575
+ else " 🧠 Classifier score: N/A"
576
+ )
577
+ logger.debug(f" ✅ Perplexity correct: {result['perplexity_correct']}")
578
+
579
+ except Exception as e:
580
+ logger.error(f"Error processing doc {i}: {e}")
581
+ continue
582
+
583
+ # Calculate overall metrics
584
+ total_samples = len(perplexity_results)
585
+
586
+ if task_name == "wikitext":
587
+ # For WikiText, we don't have correct/incorrect, just perplexity values
588
+ perplexities = [r["perplexity"] for r in perplexity_results if r["perplexity"] != float("inf")]
589
+ avg_perplexity = sum(perplexities) / len(perplexities) if perplexities else float("inf")
590
+
591
+ # Average classifier score
592
+ classifier_scores = [
593
+ r["classifier_score"] for r in perplexity_results if r["classifier_score"] is not None
594
+ ]
595
+ avg_classifier_score = sum(classifier_scores) / len(classifier_scores) if classifier_scores else None
596
+
597
+ perplexity_accuracy = 1.0 if avg_perplexity < 100 else 0.0 # Arbitrary threshold for "good" perplexity
598
+ correct_perplexity = sum(1 for r in perplexity_results if r["perplexity"] < 100)
599
+ else:
600
+ correct_perplexity = sum(1 for r in perplexity_results if r.get("perplexity_correct") == True)
601
+ perplexity_accuracy = correct_perplexity / total_samples if total_samples > 0 else 0.0
602
+
603
+ # Average classifier score
604
+ classifier_scores = [
605
+ r["classifier_score"] for r in perplexity_results if r["classifier_score"] is not None
606
+ ]
607
+ avg_classifier_score = sum(classifier_scores) / len(classifier_scores) if classifier_scores else None
608
+
609
+ logger.info("📊 PERPLEXITY EVALUATION RESULTS:")
610
+ logger.info(f" • Total samples: {total_samples}")
611
+ if task_name == "wikitext":
612
+ logger.info(f" • Average perplexity: {avg_perplexity:.3f}")
613
+ logger.info(f" • Documents with perplexity < 100: {correct_perplexity}")
614
+ else:
615
+ logger.info(f" • Perplexity accuracy: {perplexity_accuracy:.3f}")
616
+ logger.info(
617
+ f" • Average classifier score: {avg_classifier_score:.3f}"
618
+ if avg_classifier_score is not None
619
+ else " • Average classifier score: N/A"
620
+ )
621
+
622
+ result_dict = {
623
+ "ground_truth": "EVALUATED",
624
+ "method_used": "lm-eval-harness-perplexity",
625
+ "confidence": perplexity_accuracy,
626
+ "details": f"Calculated perplexity for {total_samples} samples",
627
+ "task_name": task_name,
628
+ "evaluation_method": "perplexity",
629
+ "perplexity_accuracy": perplexity_accuracy,
630
+ "average_classifier_score": avg_classifier_score,
631
+ "total_samples": total_samples,
632
+ "correct_perplexity": correct_perplexity,
633
+ "perplexity_results": perplexity_results[:10], # First 10 for debugging
634
+ }
635
+
636
+ if task_name == "wikitext":
637
+ result_dict["average_perplexity"] = avg_perplexity
638
+ result_dict["details"] = (
639
+ f"Calculated perplexity for {total_samples} WikiText documents, avg perplexity: {avg_perplexity:.3f}"
640
+ )
641
+ else:
642
+ result_dict["details"] = (
643
+ f"Calculated perplexity for {total_samples} samples, accuracy: {perplexity_accuracy:.3f}"
644
+ )
645
+
646
+ return result_dict
647
+
648
+ except Exception as e:
649
+ logger.error(f"Error in perplexity evaluation: {e}")
650
+ return self._error_result(f"Perplexity evaluation error: {e!s}")
651
+
652
+ def _get_evaluation_method_for_task(self, task_name: str) -> str:
653
+ """Get the evaluation method for a task from the benchmark configuration."""
654
+ try:
655
+ import json
656
+
657
+ eval_methods_path = "wisent/parameters/benchmarks/benchmark_evaluation_methods.json"
658
+ with open(eval_methods_path) as f:
659
+ benchmark_methods = json.load(f)
660
+ return benchmark_methods.get(task_name, "text-generation")
661
+ except Exception as e:
662
+ logger.debug(f"Could not load benchmark evaluation methods: {e}")
663
+ return "text-generation"
664
+
665
+ def _error_result(self, error_message: str) -> Dict[str, Any]:
666
+ """Return a standardized error result."""
667
+ return {
668
+ "ground_truth": "ERROR",
669
+ "method_used": "lm-eval-harness-error",
670
+ "confidence": 0.0,
671
+ "details": error_message,
672
+ "task_name": self.task_name,
673
+ "evaluation_method": self.evaluation_method,
674
+ }
675
+
676
def _map_token_aggregation_to_activation_method(self, token_aggregation: str):
    """Translate a token-aggregation name into an ``ActivationAggregationStrategy``.

    The lookup is case-insensitive. ``"last"`` selects ``LAST_TOKEN`` and ``"max"``
    selects ``MAX_POOLING``; ``"average"``, ``"mean"`` and every unrecognised name
    select ``MEAN_POOLING``.
    """
    # TODO: token aggregation should not be passed around as a string; refactor callers.
    requested = token_aggregation.lower()

    if requested == "last":
        return ActivationAggregationStrategy.LAST_TOKEN
    if requested == "max":
        return ActivationAggregationStrategy.MAX_POOLING

    # "average", "mean" and any unknown value share the mean-pooling default.
    return ActivationAggregationStrategy.MEAN_POOLING
687
+
688
+ def _is_task_interface_task(self, task_name: str) -> bool:
689
+ """Check if this is a TaskInterface task (not an lm-eval task)."""
690
+ # List of known TaskInterface tasks
691
+ task_interface_tasks = {
692
+ "hle",
693
+ "hle_exact_match",
694
+ "hle_multiple_choice",
695
+ "livecodebench",
696
+ "math500",
697
+ "math",
698
+ "hendrycks_math",
699
+ "aime",
700
+ "aime2025",
701
+ "aime2024",
702
+ "hmmt",
703
+ "hmmt_feb_2025",
704
+ "polymath",
705
+ "polymath_en_medium",
706
+ "polymath_zh_medium",
707
+ "polymath_en_high",
708
+ "polymath_zh_high",
709
+ "livemathbench",
710
+ "livemathbench_cnmo_en",
711
+ "livemathbench_cnmo_zh",
712
+ #"gsm8k",
713
+ "multirc",
714
+ "arithmetic_1dc",
715
+ #"arithmetic_2da",
716
+ "arithmetic_2dm",
717
+ "arithmetic_2ds",
718
+ "arithmetic_3da",
719
+ "arithmetic_3ds",
720
+ "arithmetic_4da",
721
+ "arithmetic_4ds",
722
+ "arithmetic_5da",
723
+ "arithmetic_5ds",
724
+ "qa4mre_2013",
725
+ #"truthfulqa_mc1",
726
+ }
727
+ return task_name in task_interface_tasks
728
+
729
def _load_task_interface_data(self, task_name: str, num_samples: int):
    """Fetch documents for a TaskInterface task.

    Args:
        task_name: Name handed to ``get_task``.
        num_samples: Forwarded to the task's ``load_data`` as ``limit``.

    Returns:
        ``(docs, task)`` on success; ``([], None)`` when importing, resolving or
        loading the task raises.
    """
    try:
        from .task_interface import get_task

        task_instance = get_task(task_name)
        loaded_docs = task_instance.load_data(limit=num_samples)
    except Exception as load_error:
        logger.error(f"Failed to load TaskInterface task {task_name}: {load_error}")
        return [], None
    return loaded_docs, task_instance
745
+
746
+ def _calculate_perplexity(self, model, text: str) -> float:
747
+ """Calculate perplexity of text using the model."""
748
+ try:
749
+ import numpy as np
750
+ import torch
751
+
752
+ # Use the model's prepare_activations method to get outputs
753
+ prepared = model.prepare_activations(text)
754
+ outputs = prepared["outputs"]
755
+ inputs = prepared["inputs"]
756
+
757
+ # Get input IDs
758
+ input_ids = inputs["input_ids"]
759
+
760
+ # Get logits from the outputs
761
+ logits = outputs.logits
762
+
763
+ # Compute log probabilities
764
+ log_probs = torch.log_softmax(logits, dim=-1)
765
+
766
+ # Get log probabilities for actual tokens (shifted for next-token prediction)
767
+ # input_ids shape: [batch_size, sequence_length]
768
+ # logits shape: [batch_size, sequence_length, vocab_size]
769
+ # We need to match targets with predictions
770
+
771
+ if input_ids.shape[1] > 1:
772
+ # Get log probabilities for the target tokens
773
+ target_ids = input_ids[0, 1:] # Skip first token (no prediction for it)
774
+ prediction_logits = log_probs[0, :-1, :] # Skip last prediction (no target for it)
775
+
776
+ # Get log probabilities for actual tokens
777
+ token_log_probs = prediction_logits.gather(dim=-1, index=target_ids.unsqueeze(-1)).squeeze(-1)
778
+
779
+ # Compute average log probability
780
+ avg_log_prob = token_log_probs.mean().item()
781
+
782
+ # Compute perplexity
783
+ perplexity = np.exp(-avg_log_prob)
784
+ else:
785
+ # Single token, cannot compute perplexity
786
+ perplexity = float("inf")
787
+
788
+ return perplexity
789
+
790
+ except Exception as e:
791
+ logger.error(f"Error calculating perplexity: {e}")
792
+ return float("inf")
793
+
794
def _evaluate_generic_code_execution(
    self, classifier, task_name: str, num_samples: int, model, layer: int, token_aggregation: str = "average"
) -> Dict[str, Any]:
    """Evaluate generic code execution tasks (non-BigCode) like LiveCodeBench.

    For every test document a prompt is built, code is generated with
    ``model.generate`` and the output is scored by
    ``SecureCodeEvaluator.evaluate_response``. Accuracy is the fraction of
    samples whose result has a truthy ``"passed"`` entry.

    Args:
        classifier: Not used by this method; kept for signature parity with the
            other evaluation entry points.
        task_name: Task to load via ``model.load_lm_eval_task``.
        num_samples: Passed as ``limit`` when loading the task.
        model: Provides ``load_lm_eval_task``, ``split_task_data`` and ``generate``.
        layer: Forwarded to ``model.generate`` as ``layer_index``.
        token_aggregation: Not used by this method.

    Returns:
        A result dict with accuracy and per-sample evaluation results, or the
        standard error result when no documents are found or an exception occurs.
    """
    secure_evaluator = None
    try:
        logger.info(f"🎯 GENERIC CODE EXECUTION EVALUATION: {task_name}")

        # Get secure code evaluator
        from .secure_code_evaluator import SecureCodeEvaluator

        secure_evaluator = SecureCodeEvaluator()

        # Load task data
        task_data = model.load_lm_eval_task(task_name, shots=0, limit=num_samples)

        # Use unified split strategy - get TEST portion only
        all_docs, split_counts = get_all_docs_from_task(task_data)
        if all_docs:
            _, docs = create_deterministic_split(all_docs, task_name)
            logger.info(
                f"Using {len(docs)} test docs from unified split "
                f"(total: {len(all_docs)}, original splits: {split_counts})"
            )
        else:
            docs, _ = model.split_task_data(task_data, split_ratio=1.0)

        if not docs:
            return self._error_result(f"No documents retrieved from task: {task_name}")

        logger.info(f"📝 Retrieved {len(docs)} documents from {task_name}")

        # Generate code for each sample
        generated_codes = []
        evaluation_results = []

        for i, doc in enumerate(docs):
            try:
                # Get prompt
                if hasattr(task_data, "doc_to_text"):
                    prompt = task_data.doc_to_text(doc)
                else:
                    # For LiveCodeBench
                    question = doc.get("question_content", doc.get("text", ""))
                    starter_code = doc.get("starter_code", "")
                    prompt = f"{question}\n\n{starter_code}" if starter_code else question

                logger.debug(f"📋 Prompt for sample {i + 1}:\n{prompt[:200]}...\n")

                # Generate code using model (more tokens for code, low temperature for deterministic output)
                logger.debug(f"🔸 Generating code for sample {i + 1}/{len(docs)}...")
                gen_kwargs = get_generate_kwargs(max_new_tokens=500, temperature=0.1, do_sample=False)
                generated_code, _ = model.generate(
                    prompt=prompt,
                    layer_index=layer,
                    **gen_kwargs,
                )

                generated_codes.append(generated_code)
                logger.debug(f" 📝 Generated code:\n{generated_code}\n")

                # Evaluate generated code
                eval_result = secure_evaluator.evaluate_response(task_name, doc, generated_code)
                evaluation_results.append(eval_result)

                logger.debug(
                    f" ✅ Evaluation result: {'PASSED' if eval_result.get('passed', False) else 'FAILED'}"
                )
                if "pass_rate" in eval_result:
                    logger.debug(f" 📊 Pass rate: {eval_result['pass_rate']:.2%}")

            except Exception as e:
                # A failing sample counts as not passed; the remaining samples still run.
                logger.error(f"Error processing sample {i}: {e}")
                generated_codes.append("")
                evaluation_results.append({"passed": False, "error": str(e), "success": False})

        # Aggregate results
        total_passed = sum(1 for r in evaluation_results if r.get("passed", False))
        accuracy = total_passed / len(evaluation_results) if evaluation_results else 0.0

        logger.info(
            f"📊 CODE EXECUTION COMPLETED: {total_passed}/{len(evaluation_results)} passed ({accuracy:.2%})"
        )

        return {
            "ground_truth": "EVALUATED",
            "method_used": f"generic-code-execution-{task_name}",
            "confidence": accuracy,
            "accuracy": accuracy,
            "details": f"Executed and evaluated {len(generated_codes)} code samples",
            "task_name": task_name,
            "evaluation_method": "code-execution",
            "total_samples": len(generated_codes),
            "passed_samples": total_passed,
            "evaluation_results": evaluation_results,
        }

    except Exception as e:
        logger.error(f"Error in generic code execution evaluation: {e}")
        import traceback

        logger.error(f"Traceback: {traceback.format_exc()}")
        return self._error_result(f"Generic code execution evaluation error: {e!s}")

    finally:
        # Release evaluator resources on every exit path (success, early return,
        # exception). A cleanup failure is logged and must not discard the result.
        if secure_evaluator is not None:
            try:
                secure_evaluator.cleanup()
            except Exception as cleanup_error:
                logger.warning(f"Failed to clean up secure code evaluator: {cleanup_error}")
897
+
898
+ def _evaluate_with_lm_eval_metrics(self, task_name: str, response_data: list, task_data) -> Dict[str, Any]:
899
+ """Evaluate responses using task-specific evaluation metrics."""
900
+ try:
901
+ correct = 0
902
+ total = len(response_data)
903
+ evaluation_details = []
904
+
905
+ for response in response_data:
906
+ generated = response["generated_response"]
907
+ ground_truth = response["ground_truth"]
908
+
909
+ # Task-specific evaluation logic
910
+ if task_name == "gsm8k":
911
+ # GSM8K uses exact match on numerical answer
912
+ is_correct = self._evaluate_gsm8k_response(generated, ground_truth)
913
+ elif task_name.startswith("math") or task_name in ["hendrycks_math"]:
914
+ # MATH-500 and related benchmarks use same evaluation as GSM8K (numerical extraction)
915
+ is_correct = self._evaluate_gsm8k_response(generated, ground_truth)
916
+ elif task_name in ["arc_easy", "arc_challenge"]:
917
+ # ARC uses exact match on choice letter/number
918
+ is_correct = self._evaluate_arc_response(generated, ground_truth)
919
+ elif task_name == "hellaswag":
920
+ # HellaSwag uses exact match on choice index
921
+ is_correct = self._evaluate_hellaswag_response(generated, ground_truth)
922
+ elif task_name == "mathqa":
923
+ # MATH_QA uses exact match on choice index (0, 1, 2, 3)
924
+ is_correct = self._evaluate_mathqa_response(generated, ground_truth)
925
+ elif task_name == "drop":
926
+ # DROP uses structured answer format with numbers, spans, and dates
927
+ is_correct = self._evaluate_drop_response(generated, ground_truth)
928
+ elif task_name.startswith("gpqa"):
929
+ # GPQA uses multiple-choice answer extraction (A, B, C, D)
930
+ is_correct = self._evaluate_multiple_choice_response(generated, ground_truth)
931
+ elif task_name.startswith("hle") and "multiple_choice" in task_name:
932
+ # HLE multiple choice uses letter extraction (A, B, C, D, E)
933
+ is_correct = self._evaluate_multiple_choice_response(generated, ground_truth)
934
+ elif task_name.startswith("truthfulqa") or task_name == "truthfulqa_mc1":
935
+ # TruthfulQA uses multiple-choice answer extraction (A, B, C, D)
936
+ is_correct = self._evaluate_multiple_choice_response(generated, ground_truth)
937
+ else:
938
+ # Default: string matching with some flexibility
939
+ is_correct = self._evaluate_default_response(generated, ground_truth)
940
+
941
+ if is_correct:
942
+ correct += 1
943
+
944
+ evaluation_details.append(
945
+ {
946
+ "question": response["question"][:100],
947
+ "generated": generated[-50:],
948
+ "ground_truth": ground_truth,
949
+ "correct": is_correct,
950
+ }
951
+ )
952
+
953
+ logger.debug(f"📊 Evaluation: {response['question'][:50]}...")
954
+ logger.debug(f" Generated: {generated[:50]}...")
955
+ logger.debug(f" Ground Truth: {ground_truth}")
956
+ logger.debug(f" Correct: {is_correct}")
957
+
958
+ accuracy = correct / total if total > 0 else 0.0
959
+
960
+ return {
961
+ "accuracy": accuracy,
962
+ "correct_predictions": correct,
963
+ "total_samples": total,
964
+ "evaluation_details": evaluation_details[:5], # First 5 for debugging
965
+ "task_name": task_name,
966
+ }
967
+
968
+ except Exception as e:
969
+ logger.error(f"Error in metrics evaluation: {e}")
970
+ return {"accuracy": 0.0, "correct_predictions": 0, "total_samples": len(response_data), "error": str(e)}
971
+
972
+ def _evaluate_gsm8k_response(self, generated: str, ground_truth) -> bool:
973
+ """Evaluate GSM8K response using numerical answer extraction."""
974
+ try:
975
+ # Extract numerical answer from generated response
976
+ # GSM8K answers are typically in format "#### 42" or just the number
977
+ generated_answer = self._extract_numerical_answer(generated)
978
+ ground_truth_answer = self._extract_numerical_answer(str(ground_truth))
979
+
980
+ # Compare numerical values
981
+ if generated_answer is not None and ground_truth_answer is not None:
982
+ return abs(generated_answer - ground_truth_answer) < 1e-6
983
+
984
+ # Fallback to string matching
985
+ return generated.strip().lower() == str(ground_truth).strip().lower()
986
+
987
+ except Exception as e:
988
+ logger.error(f"Error evaluating GSM8K response: {e}")
989
+ return False
990
+
991
+ def _extract_numerical_answer(self, text: str) -> float:
992
+ """Extract numerical answer from text."""
993
+ try:
994
+ import re
995
+
996
+ # Look for #### pattern (GSM8K format)
997
+ pattern = r"####\s*([+-]?\d+(?:\.\d+)?)"
998
+ match = re.search(pattern, text)
999
+ if match:
1000
+ return float(match.group(1))
1001
+
1002
+ # Look for last number in text
1003
+ numbers = re.findall(r"[+-]?\d+(?:\.\d+)?", text)
1004
+ if numbers:
1005
+ return float(numbers[-1])
1006
+
1007
+ return None
1008
+
1009
+ except Exception as e:
1010
+ logger.error(f"Error extracting numerical answer: {e}")
1011
+ return None
1012
+
1013
+ def _evaluate_arc_response(self, generated: str, ground_truth) -> bool:
1014
+ """Evaluate ARC response using exact match."""
1015
+ try:
1016
+ # Normalize responses
1017
+ gen_clean = generated.strip().lower()
1018
+ gt_clean = str(ground_truth).strip().lower()
1019
+
1020
+ # Direct match
1021
+ if gen_clean == gt_clean:
1022
+ return True
1023
+
1024
+ # Check if generated contains the ground truth
1025
+ if gt_clean in gen_clean:
1026
+ return True
1027
+
1028
+ # Check for choice letter/number patterns
1029
+ import re
1030
+
1031
+ gen_match = re.search(r"[abcd]|\d+", gen_clean)
1032
+ gt_match = re.search(r"[abcd]|\d+", gt_clean)
1033
+
1034
+ if gen_match and gt_match:
1035
+ return gen_match.group() == gt_match.group()
1036
+
1037
+ return False
1038
+
1039
+ except Exception as e:
1040
+ logger.error(f"Error evaluating ARC response: {e}")
1041
+ return False
1042
+
1043
+ def _evaluate_hellaswag_response(self, generated: str, ground_truth) -> bool:
1044
+ """Evaluate HellaSwag response using exact match."""
1045
+ try:
1046
+ # Normalize and compare
1047
+ gen_clean = generated.strip().lower()
1048
+ gt_clean = str(ground_truth).strip().lower()
1049
+
1050
+ return gen_clean == gt_clean or gt_clean in gen_clean
1051
+
1052
+ except Exception as e:
1053
+ logger.error(f"Error evaluating HellaSwag response: {e}")
1054
+ return False
1055
+
1056
+ def _evaluate_mathqa_response(self, generated: str, ground_truth) -> bool:
1057
+ """Evaluate MATH_QA response using choice matching."""
1058
+ try:
1059
+ import re
1060
+
1061
+ # Ground truth is typically 0, 1, 2, or 3 (choice index)
1062
+ gt_str = str(ground_truth).strip()
1063
+
1064
+ # Look for choice patterns in generated response
1065
+ gen_clean = generated.strip().lower()
1066
+
1067
+ # Direct match with choice index
1068
+ if gt_str in gen_clean:
1069
+ return True
1070
+
1071
+ # Look for choice letter patterns (a=0, b=1, c=2, d=3)
1072
+ choice_map = {"a": "0", "b": "1", "c": "2", "d": "3"}
1073
+ for letter, index in choice_map.items():
1074
+ if index == gt_str and letter in gen_clean:
1075
+ return True
1076
+
1077
+ # Look for explicit choice pattern like "The answer is 1" or "Choice B"
1078
+ choice_patterns = [
1079
+ rf"\b{gt_str}\b", # Exact number match
1080
+ rf"choice\s*{choice_map.get(gt_str, gt_str)}", # "choice 1"
1081
+ rf"answer\s*is\s*{gt_str}", # "answer is 1"
1082
+ rf"option\s*{gt_str}", # "option 1"
1083
+ ]
1084
+
1085
+ for pattern in choice_patterns:
1086
+ if re.search(pattern, gen_clean):
1087
+ return True
1088
+
1089
+ return False
1090
+
1091
+ except Exception as e:
1092
+ logger.error(f"Error evaluating MATH_QA response: {e}")
1093
+ return False
1094
+
1095
+ def _evaluate_drop_response(self, generated: str, ground_truth) -> bool:
1096
+ """Evaluate DROP response using structured answer format."""
1097
+ try:
1098
+ import json
1099
+ import re
1100
+
1101
+ # Parse ground truth if it's a string representation of a dict
1102
+ if isinstance(ground_truth, str):
1103
+ try:
1104
+ # Try to parse as JSON first
1105
+ if ground_truth.startswith("{"):
1106
+ gt_dict = json.loads(ground_truth)
1107
+ else:
1108
+ # Handle malformed string representations
1109
+ return False
1110
+ except:
1111
+ return False
1112
+ elif isinstance(ground_truth, dict):
1113
+ gt_dict = ground_truth
1114
+ else:
1115
+ return False
1116
+
1117
+ gen_clean = generated.strip().lower()
1118
+
1119
+ # Check number field
1120
+ if gt_dict.get("number"):
1121
+ number_str = str(gt_dict["number"]).strip()
1122
+ if number_str:
1123
+ # Direct number match
1124
+ if number_str.lower() in gen_clean:
1125
+ return True
1126
+
1127
+ # Try to extract numbers from generated response
1128
+ gen_numbers = re.findall(r"\b\d+\b", generated)
1129
+ if number_str in gen_numbers:
1130
+ return True
1131
+
1132
+ # Word number matching (e.g., "two" vs "2")
1133
+ number_words = {
1134
+ "0": ["zero", "none"],
1135
+ "1": ["one"],
1136
+ "2": ["two"],
1137
+ "3": ["three"],
1138
+ "4": ["four"],
1139
+ "5": ["five"],
1140
+ "6": ["six"],
1141
+ "7": ["seven"],
1142
+ "8": ["eight"],
1143
+ "9": ["nine"],
1144
+ "10": ["ten"],
1145
+ }
1146
+ if number_str in number_words:
1147
+ for word in number_words[number_str]:
1148
+ if word in gen_clean:
1149
+ return True
1150
+
1151
+ # Check spans field
1152
+ if gt_dict.get("spans"):
1153
+ spans = gt_dict["spans"]
1154
+ if isinstance(spans, list):
1155
+ for span in spans:
1156
+ span_clean = str(span).strip().lower()
1157
+ if span_clean and span_clean in gen_clean:
1158
+ return True
1159
+ elif isinstance(spans, str):
1160
+ span_clean = spans.strip().lower()
1161
+ if span_clean and span_clean in gen_clean:
1162
+ return True
1163
+
1164
+ # Check date field (less common but possible)
1165
+ if gt_dict.get("date"):
1166
+ date_obj = gt_dict["date"]
1167
+ if isinstance(date_obj, dict):
1168
+ # Check individual date components
1169
+ for component in ["day", "month", "year"]:
1170
+ if date_obj.get(component):
1171
+ date_val = str(date_obj[component]).strip().lower()
1172
+ if date_val and date_val in gen_clean:
1173
+ return True
1174
+
1175
+ return False
1176
+
1177
+ except Exception as e:
1178
+ logger.error(f"Error evaluating DROP response: {e}")
1179
+ return False
1180
+
1181
+ def _evaluate_default_response(self, generated: str, ground_truth) -> bool:
1182
+ """Default evaluation using flexible string matching."""
1183
+ try:
1184
+ gen_clean = generated.strip().lower()
1185
+
1186
+ # Handle list ground truth (e.g., COQA format)
1187
+ if isinstance(ground_truth, list):
1188
+ # Check if generated response matches any of the acceptable answers
1189
+ for gt_option in ground_truth:
1190
+ gt_clean = str(gt_option).strip().lower()
1191
+
1192
+ # Exact match
1193
+ if gen_clean == gt_clean:
1194
+ return True
1195
+
1196
+ # Contains match
1197
+ if gt_clean in gen_clean or gen_clean in gt_clean:
1198
+ return True
1199
+
1200
+ return False
1201
+ # Handle string ground truth
1202
+ gt_clean = str(ground_truth).strip().lower()
1203
+
1204
+ # Exact match
1205
+ if gen_clean == gt_clean:
1206
+ return True
1207
+
1208
+ # Contains match
1209
+ if gt_clean in gen_clean or gen_clean in gt_clean:
1210
+ return True
1211
+
1212
+ return False
1213
+
1214
+ except Exception as e:
1215
+ logger.error(f"Error in default evaluation: {e}")
1216
+ return False
1217
+
1218
+ def _evaluate_multiple_choice_response(self, generated: str, ground_truth) -> bool:
1219
+ """Evaluate multiple choice response by extracting choice letter (A, B, C, D, E)."""
1220
+ import re
1221
+
1222
+ try:
1223
+ # Clean the generated response
1224
+ gen_clean = generated.strip()
1225
+
1226
+ # Convert ground truth to string and extract expected letter
1227
+ gt_str = str(ground_truth).strip()
1228
+ expected_letter = None
1229
+
1230
+ # Extract letter from ground truth (could be "(A)", "A", etc.)
1231
+ gt_match = re.search(r"[ABCDE]", gt_str.upper())
1232
+ if gt_match:
1233
+ expected_letter = gt_match.group()
1234
+ else:
1235
+ return False
1236
+
1237
+ # Try multiple strict patterns to extract answer from generated response
1238
+ # These patterns require clear context indicating an intentional choice
1239
+ patterns = [
1240
+ # Fixed pattern to avoid matching 'A' in "Answer:" alone
1241
+ r"(?:answer|choice|option)\s*(?:is\s+|:\s*)(?:\()?([ABCDE])(?:\))?", # "Answer: A" or "Answer is (B)" - requires letter after
1242
+ r"the\s+(?:correct\s+)?answer\s+is\s*(?:\()?([ABCDE])(?:\))?", # "The answer is A" - requires "the answer is"
1243
+ r"(?:select|choose)\s+(?:\()?([ABCDE])(?:\))?", # "Select A" or "Choose A" - requires the action word
1244
+ r"(?:^|\n)([ABCDE])(?:\s*$)", # Letter at start of line followed by whitespace/end only
1245
+ r"^([ABCDE])[.,;!?)\s]*$", # Just the letter with optional punctuation and whitespace
1246
+ r"^(?:\()?([ABCDE])(?:\))?\s*$", # Just the letter with optional parentheses
1247
+ ]
1248
+
1249
+ # Try each pattern - only accept clear, intentional responses
1250
+ for pattern in patterns:
1251
+ matches = re.finditer(pattern, gen_clean.upper(), re.IGNORECASE | re.MULTILINE)
1252
+ for match in matches:
1253
+ extracted_letter = match.group(1).upper()
1254
+ if extracted_letter == expected_letter:
1255
+ return True
1256
+
1257
+ # No more fallback - if we can't clearly identify the choice, it's wrong
1258
+ return False
1259
+
1260
+ except Exception as e:
1261
+ logger.error(f"Error evaluating multiple choice response: {e}")
1262
+ return False
1263
+
1264
def _evaluate_code_execution(
    self, classifier, task_name: str, num_samples: int, model, layer: int, token_aggregation: str = "average"
) -> Dict[str, Any]:
    """Evaluate classifier using code execution approach for BigCode tasks.

    Non-BigCode tasks are delegated: to ``_evaluate_generic_code_execution`` when
    ``SecureCodeEvaluator`` recognises them as code-execution tasks, otherwise to
    ``_evaluate_text_generation``. For BigCode tasks, code is generated per
    sample, executed through the BigCode evaluator (pass@1), and each generated
    snippet is then scored by ``classifier``. A classifier score above 0.5 is
    counted as predicting that the code passes.

    Args:
        classifier: Object providing ``predict_proba`` and/or ``predict``.
        task_name: Task to evaluate.
        num_samples: Passed as ``limit`` when loading the task.
        model: Provides ``generate`` and ``extract_activations``.
        layer: Layer index used for generation and activation extraction.
        token_aggregation: Aggregation name mapped to an activation strategy.

    Returns:
        A result dict with ``pass_rate``, ``classifier_accuracy`` and the raw
        execution results, or an error dict when the evaluation fails.
    """
    try:
        logger.debug(f"🎯 CODE EXECUTION EVALUATION: {task_name}")

        # Check if it's a BigCode task
        from .bigcode_integration import get_bigcode_evaluator, is_bigcode_task, load_bigcode_task
        from .secure_code_evaluator import SecureCodeEvaluator

        if not is_bigcode_task(task_name):
            # Check if it's still a code execution task (like LiveCodeBench)
            if SecureCodeEvaluator.is_code_execution_task(task_name):
                logger.info(f"Task {task_name} is a non-BigCode code execution task")
                return self._evaluate_generic_code_execution(
                    classifier, task_name, num_samples, model, layer, token_aggregation
                )
            logger.warning(f"Task {task_name} is not a code execution task, falling back to text generation")
            return self._evaluate_text_generation(
                classifier, task_name, num_samples, model, layer, token_aggregation
            )

        # Load BigCode task
        bigcode_task = load_bigcode_task(task_name, limit=num_samples)
        logger.info(f"📝 Loaded BigCode task {task_name} with {len(bigcode_task)} samples")

        # Generate code for each sample
        generated_codes = []
        for i, sample in enumerate(bigcode_task.get_samples()):
            try:
                # Get prompt
                prompt = bigcode_task.doc_to_text(sample)
                logger.debug(f"📋 Prompt for sample {i + 1}:\n{prompt}\n")

                # Generate code using model (more tokens for code, low temperature for deterministic output)
                logger.debug(f"🔸 Generating code for sample {i + 1}/{len(bigcode_task)}...")
                gen_kwargs = get_generate_kwargs(max_new_tokens=300, temperature=0.1, do_sample=False)
                generated_code, _ = model.generate(
                    prompt=prompt,
                    layer_index=layer,
                    **gen_kwargs,
                )

                generated_codes.append(generated_code)
                logger.debug(f" 📝 Generated: {generated_code[:100]}...")
                logger.debug(f" 📝 Full generated code:\n{generated_code}\n")

            except Exception as e:
                logger.error(f"Error generating code for sample {i}: {e}")
                generated_codes.append("")  # Empty code for failed generation

        # Evaluate generated code using BigCode evaluator
        logger.info(f"🎯 Evaluating {len(generated_codes)} generated code samples...")

        # Get Docker executor if available
        docker_executor = None
        try:
            from .docker import OptimizedDockerExecutor

            docker_executor = OptimizedDockerExecutor()
        except Exception as e:
            logger.warning(f"Docker executor not available: {e}")

        # Use BigCode evaluator
        evaluator = get_bigcode_evaluator(docker_executor)

        # Prepare generations in expected format (list of lists)
        generations_for_eval = [[code] for code in generated_codes]

        # Run evaluation
        evaluation_results = evaluator.evaluate(
            bigcode_task,
            generations_for_eval,
            k_values=[1],  # Just pass@1 for now
        )

        # Extract pass rate
        pass_rate = evaluation_results.get("pass_at_k", {}).get("pass@1", 0.0)

        logger.info(f"✅ Code execution pass@1: {pass_rate:.2%}")

        execution_results = evaluation_results.get("execution_results", [])

        # Now classify the generated code to see if classifier agrees
        classification_results = []
        for i, code in enumerate(generated_codes):
            try:
                layer_obj = Layer(index=layer, type="transformer")

                # Extract activations from generated code
                activation_tensor = model.extract_activations(code, layer_obj)
                activation_method = self._map_token_aggregation_to_activation_method(token_aggregation)

                activation_obj = Activations(
                    tensor=activation_tensor, layer=layer_obj, aggregation_strategy=activation_method
                )

                # Get classifier prediction
                features = activation_obj.extract_features_for_classifier()
                features_numpy = features.cpu().numpy()

                # Get prediction probability, falling back to predict() when
                # predict_proba is missing or its output cannot be read as one float.
                try:
                    prediction_proba = classifier.predict_proba([features_numpy])
                    if isinstance(prediction_proba, (list, tuple)) and len(prediction_proba) > 0:
                        prediction = float(prediction_proba[0])
                    else:
                        prediction = float(prediction_proba)
                except Exception as proba_error:
                    logger.debug(f"predict_proba failed: {proba_error}, trying predict...")
                    predictions = classifier.predict([features_numpy])
                    if len(predictions) > 0:
                        prediction = float(predictions[0])
                    else:
                        logger.warning("Classifier predict returned empty array")
                        prediction = 0.5

                # Check if code passed tests
                code_passed = False
                if i < len(execution_results):
                    sample_results = execution_results[i].get("results", [])
                    if sample_results:
                        code_passed = sample_results[0].get("passed", False)

                classification_results.append(
                    {"classifier_score": prediction, "code_passed": code_passed, "code_snippet": code[:200]}
                )

            except Exception as e:
                logger.error(f"Error classifying generated code {i}: {e}")
                classification_results.append({"classifier_score": 0.5, "code_passed": False, "error": str(e)})

        # Analyze classifier performance:
        # the classifier should predict a high score (>0.5) exactly for passing code.
        correct_predictions = sum(
            1
            for result in classification_results
            if (result["classifier_score"] > 0.5) == bool(result["code_passed"])
        )

        classifier_accuracy = correct_predictions / len(classification_results) if classification_results else 0.0

        return {
            "ground_truth": "CODE_EXECUTION",
            "method_used": "bigcode-evaluation",
            "confidence": classifier_accuracy,
            "pass_rate": pass_rate,
            "classifier_accuracy": classifier_accuracy,
            "total_samples": len(generated_codes),
            # round(): int() truncation would under-count when the float product
            # lands just below a whole number.
            "passing_samples": int(round(pass_rate * len(generated_codes))),
            "details": f"Pass@1: {pass_rate:.2%}, Classifier accuracy: {classifier_accuracy:.2%}",
            "task_name": task_name,
            "evaluation_method": "code-execution",
            "execution_results": evaluation_results,
        }

    except Exception as e:
        logger.error(f"Error in code execution evaluation: {e}")
        import traceback

        # Log the traceback, matching _evaluate_generic_code_execution, instead of
        # printing it to stderr.
        logger.error(f"Traceback: {traceback.format_exc()}")
        return {
            "ground_truth": "ERROR",
            "method_used": "code-execution-error",
            "confidence": 0.0,
            "details": f"Code execution evaluation failed: {e!s}",
            "task_name": task_name,
            "evaluation_method": "code-execution",
        }