wisent 0.7.379__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (1720)
  1. wisent/__init__.py +64 -0
  2. wisent/cli.py +114 -0
  3. wisent/core/__init__.py +40 -0
  4. wisent/core/activations/__init__.py +26 -0
  5. wisent/core/activations/activations.py +97 -0
  6. wisent/core/activations/activations_collector.py +506 -0
  7. wisent/core/activations/core/__init__.py +0 -0
  8. wisent/core/activations/core/atoms.py +219 -0
  9. wisent/core/activations/prompt_construction_strategy.py +47 -0
  10. wisent/core/adapters/__init__.py +22 -0
  11. wisent/core/adapters/audio.py +616 -0
  12. wisent/core/adapters/base.py +420 -0
  13. wisent/core/adapters/multimodal.py +738 -0
  14. wisent/core/adapters/robotics.py +643 -0
  15. wisent/core/adapters/text.py +441 -0
  16. wisent/core/adapters/video.py +555 -0
  17. wisent/core/agent/__init__.py +1 -0
  18. wisent/core/agent/budget.py +644 -0
  19. wisent/core/agent/device_benchmarks.py +691 -0
  20. wisent/core/agent/diagnose/__init__.py +1 -0
  21. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  22. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  23. wisent/core/agent/diagnose/create_classifier.py +1155 -0
  24. wisent/core/agent/diagnose/response_diagnostics.py +273 -0
  25. wisent/core/agent/diagnose/select_classifiers.py +507 -0
  26. wisent/core/agent/diagnose/synthetic_classifier_option.py +755 -0
  27. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  28. wisent/core/agent/diagnose/tasks/task_manager.py +1453 -0
  29. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  30. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  31. wisent/core/agent/diagnose.py +249 -0
  32. wisent/core/agent/steer.py +215 -0
  33. wisent/core/agent/timeout.py +134 -0
  34. wisent/core/autonomous_agent.py +1158 -0
  35. wisent/core/benchmark_extractors.py +372 -0
  36. wisent/core/benchmark_registry.py +151 -0
  37. wisent/core/bigcode_extractors.py +26 -0
  38. wisent/core/bigcode_integration.py +886 -0
  39. wisent/core/branding.py +108 -0
  40. wisent/core/classifier/__init__.py +1 -0
  41. wisent/core/classifier/models/__init__.py +1 -0
  42. wisent/core/classifiers/__init__.py +1 -0
  43. wisent/core/classifiers/classifiers/__init__.py +0 -0
  44. wisent/core/classifiers/classifiers/core/__init__.py +0 -0
  45. wisent/core/classifiers/classifiers/core/atoms.py +748 -0
  46. wisent/core/classifiers/classifiers/models/__init__.py +0 -0
  47. wisent/core/classifiers/classifiers/models/logistic.py +29 -0
  48. wisent/core/classifiers/classifiers/models/mlp.py +47 -0
  49. wisent/core/classifiers/classifiers/rotator.py +137 -0
  50. wisent/core/classifiers/core/__init__.py +1 -0
  51. wisent/core/classifiers/models/__init__.py +1 -0
  52. wisent/core/classifiers/pipeline_steps/__init__.py +1 -0
  53. wisent/core/cli/__init__.py +26 -0
  54. wisent/core/cli/agent/__init__.py +15 -0
  55. wisent/core/cli/agent/apply_steering.py +192 -0
  56. wisent/core/cli/agent/evaluate_response.py +128 -0
  57. wisent/core/cli/agent/generate_synthetic_pairs.py +123 -0
  58. wisent/core/cli/agent/main.py +139 -0
  59. wisent/core/cli/agent/train_classifier.py +173 -0
  60. wisent/core/cli/check_linearity.py +126 -0
  61. wisent/core/cli/create_steering_vector.py +304 -0
  62. wisent/core/cli/diagnose_pairs.py +153 -0
  63. wisent/core/cli/diagnose_vectors.py +404 -0
  64. wisent/core/cli/estimate_unified_goodness_time.py +428 -0
  65. wisent/core/cli/evaluate_refusal.py +241 -0
  66. wisent/core/cli/evaluate_responses.py +926 -0
  67. wisent/core/cli/generate_humanization_pairs.py +128 -0
  68. wisent/core/cli/generate_pairs.py +175 -0
  69. wisent/core/cli/generate_pairs_from_task.py +108 -0
  70. wisent/core/cli/generate_responses.py +160 -0
  71. wisent/core/cli/generate_vector_from_synthetic.py +217 -0
  72. wisent/core/cli/generate_vector_from_task.py +248 -0
  73. wisent/core/cli/get_activations.py +192 -0
  74. wisent/core/cli/inference_config.py +84 -0
  75. wisent/core/cli/inference_config_cli.py +54 -0
  76. wisent/core/cli/modify_weights.py +660 -0
  77. wisent/core/cli/multi_steer.py +112 -0
  78. wisent/core/cli/optimization_cache.py +298 -0
  79. wisent/core/cli/optimize.py +621 -0
  80. wisent/core/cli/optimize_classification.py +473 -0
  81. wisent/core/cli/optimize_sample_size.py +390 -0
  82. wisent/core/cli/optimize_steering.py +3421 -0
  83. wisent/core/cli/optimize_weights.py +1287 -0
  84. wisent/core/cli/steering_method_trainer.py +641 -0
  85. wisent/core/cli/steering_search_space.py +508 -0
  86. wisent/core/cli/tasks.py +940 -0
  87. wisent/core/cli/train_unified_goodness.py +681 -0
  88. wisent/core/cli_logger.py +22 -0
  89. wisent/core/config_manager.py +1731 -0
  90. wisent/core/contrastive_pairs/__init__.py +15 -0
  91. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  92. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  93. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  94. wisent/core/contrastive_pairs/core/pair.py +183 -0
  95. wisent/core/contrastive_pairs/core/response.py +153 -0
  96. wisent/core/contrastive_pairs/core/serialization.py +306 -0
  97. wisent/core/contrastive_pairs/core/set.py +192 -0
  98. wisent/core/contrastive_pairs/diagnostics/__init__.py +79 -0
  99. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  100. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  101. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1655 -0
  102. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  103. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  104. wisent/core/contrastive_pairs/diagnostics/duplicates.py +118 -0
  105. wisent/core/contrastive_pairs/diagnostics/linearity.py +325 -0
  106. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +620 -0
  107. wisent/core/contrastive_pairs/huggingface_pairs/__init__.py +1 -0
  108. wisent/core/contrastive_pairs/huggingface_pairs/atoms.py +255 -0
  109. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +470 -0
  110. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_registry.py +136 -0
  111. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +44 -0
  112. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentbench.py +225 -0
  113. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentharm.py +267 -0
  114. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +444 -0
  115. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +225 -0
  116. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime.py +118 -0
  117. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2024.py +74 -0
  118. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2025.py +73 -0
  119. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/alpaca_eval.py +153 -0
  120. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +182 -0
  121. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/arena_hard.py +179 -0
  122. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/atis.py +89 -0
  123. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/babilong.py +96 -0
  124. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bangla_mmlu.py +108 -0
  125. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/basqueglue.py +217 -0
  126. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bec2016eu.py +99 -0
  127. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bfcl.py +283 -0
  128. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bhtc_v2.py +87 -0
  129. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +245 -0
  130. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chain_of_thought.py +89 -0
  131. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chinese_simpleqa.py +209 -0
  132. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cluewsc.py +177 -0
  133. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cnn_dailymail.py +92 -0
  134. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +378 -0
  135. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +109 -0
  136. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +15 -0
  137. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +64 -0
  138. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +65 -0
  139. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +65 -0
  140. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +65 -0
  141. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +65 -0
  142. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +65 -0
  143. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +844 -0
  144. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coedit_gec.py +79 -0
  145. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/conala.py +133 -0
  146. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/concode.py +111 -0
  147. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/dbpedia_14.py +91 -0
  148. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/doc_vqa.py +102 -0
  149. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/donotanswer.py +236 -0
  150. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds1000.py +129 -0
  151. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds_1000.py +155 -0
  152. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/epec_koref_bin.py +85 -0
  153. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ethos_binary.py +82 -0
  154. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_mp.py +165 -0
  155. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_sp_sum_task_fp_small_p1.py +89 -0
  156. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/facts_grounding.py +181 -0
  157. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +295 -0
  158. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/financial_tweets.py +100 -0
  159. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +270 -0
  160. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flan_held_in.py +98 -0
  161. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +572 -0
  162. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +143 -0
  163. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +99 -0
  164. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_negative_example_livecodebench.py +146 -0
  165. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_positive_example_livecodebench.py +140 -0
  166. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/gpt3_translation_benchmarks.py +98 -0
  167. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +389 -0
  168. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/halueval.py +246 -0
  169. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/harmbench.py +250 -0
  170. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/healthbench.py +181 -0
  171. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hle.py +106 -0
  172. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hmmt.py +117 -0
  173. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +119 -0
  174. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humanevalpack.py +102 -0
  175. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +180 -0
  176. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +129 -0
  177. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_ar_en.py +98 -0
  178. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_en_ar.py +98 -0
  179. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/jailbreakbench.py +258 -0
  180. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/law_stack_exchange.py +101 -0
  181. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ledgar.py +118 -0
  182. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench.py +61 -0
  183. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_contrastive_pair_generator.py +491 -0
  184. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_v6.py +263 -0
  185. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +230 -0
  186. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/llama.py +96 -0
  187. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +285 -0
  188. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/m_mmlu.py +96 -0
  189. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math.py +186 -0
  190. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +146 -0
  191. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +142 -0
  192. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/meddialog.py +79 -0
  193. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medical_abstracts.py +101 -0
  194. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +787 -0
  195. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +111 -0
  196. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlu_redux.py +194 -0
  197. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlusr.py +108 -0
  198. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multimedqa.py +99 -0
  199. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multipl_e.py +109 -0
  200. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple.py +96 -0
  201. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_choice.py +87 -0
  202. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_cpp.py +128 -0
  203. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_go.py +128 -0
  204. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_java.py +128 -0
  205. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_js.py +128 -0
  206. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_py.py +15 -0
  207. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_rs.py +128 -0
  208. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/non_greedy_robustness_agieval_aqua_rat.py +92 -0
  209. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +287 -0
  210. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/openllm.py +99 -0
  211. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/option_order_robustness_agieval_aqua_rat.py +92 -0
  212. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/or_bench.py +300 -0
  213. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/penn_treebank.py +80 -0
  214. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +317 -0
  215. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +467 -0
  216. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/prompt_robustness_agieval_aqua_rat.py +92 -0
  217. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/pythia.py +99 -0
  218. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +131 -0
  219. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +280 -0
  220. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/scicode.py +275 -0
  221. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/self_consistency.py +90 -0
  222. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +145 -0
  223. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sorry_bench.py +211 -0
  224. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/stsb.py +79 -0
  225. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1.py +99 -0
  226. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1_seq2seq.py +98 -0
  227. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_t5_prompt.py +123 -0
  228. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_gpqa.py +106 -0
  229. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench.py +428 -0
  230. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench_verified.py +158 -0
  231. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sycophancy_eval.py +205 -0
  232. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/t0_eval.py +79 -0
  233. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tag.py +98 -0
  234. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +305 -0
  235. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tmlu.py +109 -0
  236. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +360 -0
  237. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +386 -0
  238. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/travelplanner.py +286 -0
  239. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/truthfulqa_generation.py +128 -0
  240. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/unfair_tos.py +83 -0
  241. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/vaxx_stance.py +86 -0
  242. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wiceu.py +85 -0
  243. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wikitext103.py +97 -0
  244. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wildguard.py +280 -0
  245. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_en_fr.py +97 -0
  246. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_fr_en.py +97 -0
  247. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_de_en.py +90 -0
  248. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_de.py +90 -0
  249. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_ro.py +90 -0
  250. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_ro_en.py +90 -0
  251. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt_ro_en_t5_prompt.py +90 -0
  252. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/xsum.py +81 -0
  253. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  254. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +265 -0
  255. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/__init__.py +472 -0
  256. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aclue.py +24 -0
  257. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acp.py +33 -0
  258. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acpbench.py +39 -0
  259. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/advanced_ai_risk.py +59 -0
  260. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aexams.py +14 -0
  261. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimgsm.py +10 -0
  262. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimmlu.py +10 -0
  263. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrixnli.py +9 -0
  264. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench.py +14 -0
  265. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_adr.py +9 -0
  266. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afriqa.py +9 -0
  267. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afrisenti.py +9 -0
  268. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_belebele.py +9 -0
  269. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_flores.py +9 -0
  270. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_injongointent.py +9 -0
  271. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_mafand.py +9 -0
  272. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhaner.py +9 -0
  273. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhanews.py +9 -0
  274. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhapos.py +9 -0
  275. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_naijarc.py +9 -0
  276. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_nollysenti.py +9 -0
  277. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_ntrex.py +9 -0
  278. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_openai_mmlu.py +9 -0
  279. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_salt.py +9 -0
  280. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_sib.py +9 -0
  281. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_uhura_arc_easy.py +9 -0
  282. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_xlsum.py +9 -0
  283. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/agieval.py +33 -0
  284. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/anli.py +9 -0
  285. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arab_culture.py +24 -0
  286. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva.py +67 -0
  287. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva_light.py +67 -0
  288. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_complete.py +24 -0
  289. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_light.py +81 -0
  290. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabicmmlu.py +59 -0
  291. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aradice.py +36 -0
  292. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arc.py +61 -0
  293. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arithmetic.py +19 -0
  294. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/basque_bench.py +37 -0
  295. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbh.py +121 -0
  296. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbq.py +9 -0
  297. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/belebele.py +293 -0
  298. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bertaqa.py +25 -0
  299. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bigbench.py +300 -0
  300. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/blimp.py +76 -0
  301. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/careqa.py +9 -0
  302. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/catalan_bench.py +43 -0
  303. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ceval_valid.py +61 -0
  304. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/cmmlu.py +76 -0
  305. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +16 -0
  306. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/copal_id.py +11 -0
  307. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/crows_pairs.py +31 -0
  308. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/csatqa.py +15 -0
  309. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darija.py +29 -0
  310. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darijammlu.py +57 -0
  311. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/egymmlu.py +62 -0
  312. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/eus.py +76 -0
  313. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/evalita_mp.py +93 -0
  314. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/fld.py +9 -0
  315. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/flores.py +466 -0
  316. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +9 -0
  317. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/french_bench.py +23 -0
  318. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/galician_bench.py +41 -0
  319. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/glianorex.py +11 -0
  320. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/global_mmlu.py +115 -0
  321. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gpqa.py +27 -0
  322. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k.py +9 -0
  323. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k_platinum.py +9 -0
  324. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/haerae.py +14 -0
  325. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/headqa.py +11 -0
  326. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hellaswag.py +39 -0
  327. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_ethics.py +14 -0
  328. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_math.py +9 -0
  329. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hrm8k.py +20 -0
  330. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/inverse.py +22 -0
  331. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/japanese_leaderboard.py +20 -0
  332. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/jsonschema_bench.py +9 -0
  333. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kbl.py +85 -0
  334. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kmmlu.py +281 -0
  335. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kobest.py +14 -0
  336. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kormedmcqa.py +9 -0
  337. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lambada.py +28 -0
  338. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/leaderboard.py +52 -0
  339. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/libra.py +9 -0
  340. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lingoly.py +11 -0
  341. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/longbench.py +9 -0
  342. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/m.py +43 -0
  343. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mastermind.py +9 -0
  344. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mathqa.py +9 -0
  345. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/med.py +24 -0
  346. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/meddialog.py +12 -0
  347. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/medqa.py +9 -0
  348. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mela.py +18 -0
  349. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/metabench.py +36 -0
  350. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mgsm.py +44 -0
  351. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/minerva_math.py +16 -0
  352. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mlqa.py +58 -0
  353. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu.py +70 -0
  354. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro.py +23 -0
  355. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro_plus.py +23 -0
  356. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_prox.py +191 -0
  357. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlusr.py +9 -0
  358. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmmu.py +46 -0
  359. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/model_written_evals.py +9 -0
  360. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/multiblimp.py +111 -0
  361. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/non.py +23 -0
  362. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noreval.py +143 -0
  363. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noridiom.py +20 -0
  364. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nortruthfulqa.py +32 -0
  365. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nrk.py +20 -0
  366. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi.py +9 -0
  367. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_arc_multilingual.py +10 -0
  368. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_hellaswag_multilingual.py +24 -0
  369. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_mmlu_multilingual.py +24 -0
  370. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_truthfulqa_multilingual.py +34 -0
  371. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/paloma.py +25 -0
  372. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pawsx.py +9 -0
  373. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/persona.py +144 -0
  374. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pile.py +31 -0
  375. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/polemo2.py +9 -0
  376. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/portuguese_bench.py +31 -0
  377. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/prompt.py +23 -0
  378. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qa4mre.py +12 -0
  379. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qasper.py +11 -0
  380. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ru.py +19 -0
  381. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ruler.py +9 -0
  382. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/score.py +20 -0
  383. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/scrolls.py +9 -0
  384. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/self_consistency.py +11 -0
  385. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/spanish_bench.py +38 -0
  386. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/storycloze.py +9 -0
  387. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/super_glue_t5_prompt.py +17 -0
  388. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tinyBenchmarks.py +9 -0
  389. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmlu.py +9 -0
  390. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmmluplus.py +80 -0
  391. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/translation.py +9 -0
  392. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa.py +76 -0
  393. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa_multi.py +24 -0
  394. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/turkishmmlu.py +30 -0
  395. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unitxt.py +23 -0
  396. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unscramble.py +9 -0
  397. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/winogender.py +16 -0
  398. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmdp.py +12 -0
  399. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt14.py +16 -0
  400. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt16.py +22 -0
  401. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wsc273.py +9 -0
  402. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xcopa.py +21 -0
  403. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli.py +28 -0
  404. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli_eu.py +12 -0
  405. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xquad.py +22 -0
  406. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xstorycloze.py +22 -0
  407. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xwinograd.py +15 -0
  408. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +478 -0
  409. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +140 -0
  410. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +125 -0
  411. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +171 -0
  412. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +207 -0
  413. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +185 -0
  414. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +130 -0
  415. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +184 -0
  416. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimgsm.py +98 -0
  417. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +113 -0
  418. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +129 -0
  419. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_cot.py +88 -0
  420. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_mc.py +107 -0
  421. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ag.py +134 -0
  422. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +155 -0
  423. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ai2_arc.py +114 -0
  424. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams1.py +81 -0
  425. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams2.py +81 -0
  426. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anli.py +140 -0
  427. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +180 -0
  428. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +98 -0
  429. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +104 -0
  430. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +168 -0
  431. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +168 -0
  432. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +167 -0
  433. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +268 -0
  434. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +133 -0
  435. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +118 -0
  436. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +118 -0
  437. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_gen.py +101 -0
  438. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_mc.py +106 -0
  439. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/argument.py +134 -0
  440. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +114 -0
  441. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +122 -0
  442. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/assin.py +103 -0
  443. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +113 -0
  444. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +155 -0
  445. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_gen.py +168 -0
  446. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_mc.py +139 -0
  447. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbh.py +133 -0
  448. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +169 -0
  449. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +181 -0
  450. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +155 -0
  451. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +165 -0
  452. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +155 -0
  453. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +143 -0
  454. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bigbench.py +170 -0
  455. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +171 -0
  456. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +152 -0
  457. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +117 -0
  458. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq_seq2seq.py +117 -0
  459. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +150 -0
  460. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +152 -0
  461. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabreu.py +127 -0
  462. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +169 -0
  463. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +155 -0
  464. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_gen.py +119 -0
  465. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_mc.py +113 -0
  466. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +171 -0
  467. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +139 -0
  468. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +117 -0
  469. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +223 -0
  470. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +163 -0
  471. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +110 -0
  472. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +238 -0
  473. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +151 -0
  474. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +152 -0
  475. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +166 -0
  476. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +144 -0
  477. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +148 -0
  478. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +161 -0
  479. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +114 -0
  480. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +107 -0
  481. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +149 -0
  482. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cola.py +83 -0
  483. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +107 -0
  484. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +127 -0
  485. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +124 -0
  486. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +169 -0
  487. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +162 -0
  488. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqcat.py +114 -0
  489. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/crows_pairs.py +158 -0
  490. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +152 -0
  491. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +107 -0
  492. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle_letters.py +81 -0
  493. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +221 -0
  494. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +174 -0
  495. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +152 -0
  496. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +157 -0
  497. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +152 -0
  498. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +107 -0
  499. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  500. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egyhellaswag.py +125 -0
  501. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egymmlu.py +180 -0
  502. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +142 -0
  503. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +107 -0
  504. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +194 -0
  505. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +152 -0
  506. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +152 -0
  507. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +152 -0
  508. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/escola.py +85 -0
  509. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +135 -0
  510. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethos.py +99 -0
  511. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +107 -0
  512. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +225 -0
  513. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +159 -0
  514. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +159 -0
  515. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +159 -0
  516. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +166 -0
  517. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_sp.py +109 -0
  518. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fda.py +105 -0
  519. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +107 -0
  520. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +114 -0
  521. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fld.py +143 -0
  522. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +202 -0
  523. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_mc.py +98 -0
  524. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_perplexity.py +86 -0
  525. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galcola.py +109 -0
  526. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +155 -0
  527. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_gen.py +118 -0
  528. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_mc.py +112 -0
  529. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +141 -0
  530. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +118 -0
  531. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +171 -0
  532. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +152 -0
  533. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glue.py +109 -0
  534. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpqa.py +161 -0
  535. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +110 -0
  536. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +184 -0
  537. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm.py +108 -0
  538. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +134 -0
  539. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +152 -0
  540. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  541. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +125 -0
  542. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +225 -0
  543. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +191 -0
  544. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +179 -0
  545. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hle.py +111 -0
  546. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +203 -0
  547. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval.py +124 -0
  548. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +152 -0
  549. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +152 -0
  550. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ifeval.py +118 -0
  551. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +107 -0
  552. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +192 -0
  553. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/iwslt2017.py +117 -0
  554. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +107 -0
  555. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +155 -0
  556. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_gen.py +224 -0
  557. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +120 -0
  558. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/jsonschema_bench.py +123 -0
  559. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kbl.py +140 -0
  560. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +168 -0
  561. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_cot.py +88 -0
  562. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_mc.py +107 -0
  563. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +165 -0
  564. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +160 -0
  565. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada.py +147 -0
  566. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +185 -0
  567. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +185 -0
  568. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual_stablelm.py +141 -0
  569. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +107 -0
  570. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +194 -0
  571. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/libra.py +165 -0
  572. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +203 -0
  573. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +155 -0
  574. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +152 -0
  575. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +152 -0
  576. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logieval.py +82 -0
  577. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  578. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  579. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +152 -0
  580. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +152 -0
  581. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +203 -0
  582. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mathqa.py +137 -0
  583. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +123 -0
  584. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +115 -0
  585. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +224 -0
  586. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +180 -0
  587. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +107 -0
  588. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mediqa_qa2019.py +123 -0
  589. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +169 -0
  590. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +118 -0
  591. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medtext.py +108 -0
  592. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +96 -0
  593. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meqsum.py +115 -0
  594. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +154 -0
  595. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mgsm.py +122 -0
  596. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mimic_repsum.py +140 -0
  597. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +172 -0
  598. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mlqa.py +143 -0
  599. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +144 -0
  600. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_cot.py +88 -0
  601. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_mc.py +107 -0
  602. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_pro.py +145 -0
  603. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +189 -0
  604. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmmu.py +150 -0
  605. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mnli.py +113 -0
  606. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/model_written_evals.py +115 -0
  607. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/moral_stories.py +151 -0
  608. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  609. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog.py +118 -0
  610. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog_perplexity.py +97 -0
  611. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +134 -0
  612. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multilingual.py +106 -0
  613. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  614. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  615. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +107 -0
  616. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +173 -0
  617. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +157 -0
  618. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen.py +277 -0
  619. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +165 -0
  620. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +228 -0
  621. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +223 -0
  622. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noticia.py +105 -0
  623. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +135 -0
  624. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi.py +27 -0
  625. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +167 -0
  626. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +174 -0
  627. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +162 -0
  628. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +209 -0
  629. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +186 -0
  630. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph_perplexity.py +97 -0
  631. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +118 -0
  632. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +107 -0
  633. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paloma.py +205 -0
  634. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +110 -0
  635. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +110 -0
  636. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +107 -0
  637. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +154 -0
  638. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +115 -0
  639. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +246 -0
  640. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +144 -0
  641. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases_ca_va.py +82 -0
  642. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +161 -0
  643. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile_10k.py +140 -0
  644. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +116 -0
  645. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polemo2.py +135 -0
  646. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +155 -0
  647. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +155 -0
  648. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_gen.py +121 -0
  649. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_mc.py +103 -0
  650. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +107 -0
  651. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +115 -0
  652. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  653. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +119 -0
  654. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +118 -0
  655. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +112 -0
  656. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  657. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +107 -0
  658. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  659. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/quac.py +111 -0
  660. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +124 -0
  661. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +107 -0
  662. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/realtoxicityprompts.py +124 -0
  663. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +125 -0
  664. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +110 -0
  665. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  666. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +170 -0
  667. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +113 -0
  668. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +177 -0
  669. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +161 -0
  670. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +157 -0
  671. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +110 -0
  672. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +131 -0
  673. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +119 -0
  674. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/simple_cooccurrence_bias.py +121 -0
  675. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +209 -0
  676. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  677. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +155 -0
  678. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_gen.py +117 -0
  679. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_mc.py +110 -0
  680. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +129 -0
  681. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad_completion.py +121 -0
  682. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  683. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +250 -0
  684. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +107 -0
  685. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +107 -0
  686. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +154 -0
  687. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/superglue.py +111 -0
  688. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/supergpqa.py +111 -0
  689. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +115 -0
  690. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +179 -0
  691. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +117 -0
  692. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +110 -0
  693. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +110 -0
  694. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +110 -0
  695. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +155 -0
  696. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +110 -0
  697. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +110 -0
  698. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +110 -0
  699. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +113 -0
  700. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +110 -0
  701. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +181 -0
  702. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/toxigen.py +91 -0
  703. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/translation.py +149 -0
  704. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +130 -0
  705. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +112 -0
  706. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +120 -0
  707. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +140 -0
  708. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_multi.py +142 -0
  709. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +152 -0
  710. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +161 -0
  711. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_cot.py +104 -0
  712. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +102 -0
  713. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/twenty_newsgroups.py +111 -0
  714. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unitxt.py +131 -0
  715. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +155 -0
  716. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +95 -0
  717. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +130 -0
  718. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +122 -0
  719. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wikitext.py +146 -0
  720. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogender.py +139 -0
  721. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +118 -0
  722. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +155 -0
  723. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt14.py +110 -0
  724. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt16.py +118 -0
  725. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +114 -0
  726. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +117 -0
  727. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +180 -0
  728. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +197 -0
  729. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +147 -0
  730. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +131 -0
  731. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +203 -0
  732. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +129 -0
  733. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +124 -0
  734. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/yahoo.py +108 -0
  735. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +155 -0
  736. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +56 -0
  737. wisent/core/data_loaders/__init__.py +235 -0
  738. wisent/core/data_loaders/core/__init__.py +0 -0
  739. wisent/core/data_loaders/core/atoms.py +99 -0
  740. wisent/core/data_loaders/loaders/__init__.py +0 -0
  741. wisent/core/data_loaders/loaders/custom.py +120 -0
  742. wisent/core/data_loaders/loaders/huggingface_loader.py +153 -0
  743. wisent/core/data_loaders/loaders/lm_loader.py +494 -0
  744. wisent/core/data_loaders/loaders/lm_loader_special_cases.py +496 -0
  745. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  746. wisent/core/data_loaders/rotator.py +118 -0
  747. wisent/core/detection_handling.py +259 -0
  748. wisent/core/diversity_processors.py +193 -0
  749. wisent/core/download_full_benchmarks.py +1512 -0
  750. wisent/core/errors/__init__.py +203 -0
  751. wisent/core/errors/error_codes.py +763 -0
  752. wisent/core/errors/error_handler.py +134 -0
  753. wisent/core/evaluators/__init__.py +0 -0
  754. wisent/core/evaluators/benchmark_specific/__init__.py +42 -0
  755. wisent/core/evaluators/benchmark_specific/aime_evaluator.py +90 -0
  756. wisent/core/evaluators/benchmark_specific/coding/__init__.py +0 -0
  757. wisent/core/evaluators/benchmark_specific/coding/metrics/__init__.py +0 -0
  758. wisent/core/evaluators/benchmark_specific/coding/metrics/core/__init__.py +0 -0
  759. wisent/core/evaluators/benchmark_specific/coding/metrics/core/atoms.py +36 -0
  760. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +363 -0
  761. wisent/core/evaluators/benchmark_specific/coding/metrics/passk.py +67 -0
  762. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/__init__.py +0 -0
  763. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/__init__.py +0 -0
  764. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/atoms.py +27 -0
  765. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  766. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/java_sanitizer.py +78 -0
  767. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/python_sanitizer.py +94 -0
  768. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/utils.py +126 -0
  769. wisent/core/evaluators/benchmark_specific/coding/providers/__init__.py +18 -0
  770. wisent/core/evaluators/benchmark_specific/coding/providers/core/__init__.py +0 -0
  771. wisent/core/evaluators/benchmark_specific/coding/providers/core/atoms.py +31 -0
  772. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  773. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  774. wisent/core/evaluators/benchmark_specific/coding/safe_docker/Dockerfile +31 -0
  775. wisent/core/evaluators/benchmark_specific/coding/safe_docker/__init__.py +0 -0
  776. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/__init__.py +0 -0
  777. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/atoms.py +105 -0
  778. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/runtime.py +143 -0
  779. wisent/core/evaluators/benchmark_specific/coding/safe_docker/entrypoint.py +121 -0
  780. wisent/core/evaluators/benchmark_specific/coding/safe_docker/recipes.py +60 -0
  781. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  782. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +332 -0
  783. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +81 -0
  784. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +173 -0
  785. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +488 -0
  786. wisent/core/evaluators/benchmark_specific/livemathbench_evaluator.py +393 -0
  787. wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py +202 -0
  788. wisent/core/evaluators/benchmark_specific/math_evaluator.py +119 -0
  789. wisent/core/evaluators/benchmark_specific/math_parsing/__init__.py +1 -0
  790. wisent/core/evaluators/benchmark_specific/math_parsing/core.py +1640 -0
  791. wisent/core/evaluators/benchmark_specific/math_parsing/extract_boxed.py +48 -0
  792. wisent/core/evaluators/benchmark_specific/math_parsing/is_equiv.py +159 -0
  793. wisent/core/evaluators/benchmark_specific/math_parsing/scripts.py +919 -0
  794. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +175 -0
  795. wisent/core/evaluators/benchmark_specific/polymath_evaluator.py +114 -0
  796. wisent/core/evaluators/core/__init__.py +5 -0
  797. wisent/core/evaluators/core/atoms.py +166 -0
  798. wisent/core/evaluators/custom/__init__.py +20 -0
  799. wisent/core/evaluators/custom/custom_evaluator.py +382 -0
  800. wisent/core/evaluators/custom/examples/__init__.py +37 -0
  801. wisent/core/evaluators/custom/examples/desklib_detector.py +166 -0
  802. wisent/core/evaluators/custom/examples/gptzero.py +185 -0
  803. wisent/core/evaluators/custom/examples/humanization.py +79 -0
  804. wisent/core/evaluators/custom/examples/humanization_coherent.py +127 -0
  805. wisent/core/evaluators/custom/examples/roberta_detector.py +173 -0
  806. wisent/core/evaluators/oracles/__init__.py +0 -0
  807. wisent/core/evaluators/oracles/interactive.py +73 -0
  808. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  809. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +168 -0
  810. wisent/core/evaluators/oracles/user_specified.py +67 -0
  811. wisent/core/evaluators/personalization/__init__.py +12 -0
  812. wisent/core/evaluators/personalization/alignment.py +166 -0
  813. wisent/core/evaluators/personalization/coherence.py +325 -0
  814. wisent/core/evaluators/personalization/difference.py +73 -0
  815. wisent/core/evaluators/rotator.py +217 -0
  816. wisent/core/evaluators/steering_evaluators.py +386 -0
  817. wisent/core/evaluators/synthetic_evaluator.py +377 -0
  818. wisent/core/hyperparameter_optimizer.py +547 -0
  819. wisent/core/layer.py +17 -0
  820. wisent/core/lm_eval_harness_ground_truth.py +1431 -0
  821. wisent/core/main.py +101 -0
  822. wisent/core/managed_cached_benchmarks.py +609 -0
  823. wisent/core/mixed_benchmark_sampler.py +366 -0
  824. wisent/core/modalities/__init__.py +545 -0
  825. wisent/core/model_persistence.py +302 -0
  826. wisent/core/models/__init__.py +23 -0
  827. wisent/core/models/core/__init__.py +0 -0
  828. wisent/core/models/core/atoms.py +465 -0
  829. wisent/core/models/inference_config.py +127 -0
  830. wisent/core/models/wisent_model.py +893 -0
  831. wisent/core/multi_steering.py +397 -0
  832. wisent/core/opti/__init__.py +0 -0
  833. wisent/core/opti/core/__init__.py +0 -0
  834. wisent/core/opti/core/atoms.py +177 -0
  835. wisent/core/opti/methods/__init__.py +10 -0
  836. wisent/core/opti/methods/opti_classificator.py +172 -0
  837. wisent/core/opti/methods/opti_steering.py +139 -0
  838. wisent/core/opti/methods/opti_weights.py +523 -0
  839. wisent/core/optuna/__init__.py +54 -0
  840. wisent/core/optuna/classifier/__init__.py +25 -0
  841. wisent/core/optuna/classifier/activation_generator.py +351 -0
  842. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  843. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +685 -0
  844. wisent/core/optuna/steering/__init__.py +20 -0
  845. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +200 -0
  846. wisent/core/optuna/steering/data_utils.py +342 -0
  847. wisent/core/optuna/steering/metrics.py +412 -0
  848. wisent/core/optuna/steering/steering_optimization.py +1096 -0
  849. wisent/core/parser.py +1662 -0
  850. wisent/core/parser_arguments/__init__.py +10 -0
  851. wisent/core/parser_arguments/agent_parser.py +122 -0
  852. wisent/core/parser_arguments/check_linearity_parser.py +82 -0
  853. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  854. wisent/core/parser_arguments/create_steering_vector_parser.py +67 -0
  855. wisent/core/parser_arguments/diagnose_pairs_parser.py +25 -0
  856. wisent/core/parser_arguments/diagnose_vectors_parser.py +72 -0
  857. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  858. wisent/core/parser_arguments/evaluate_refusal_parser.py +32 -0
  859. wisent/core/parser_arguments/evaluate_responses_parser.py +12 -0
  860. wisent/core/parser_arguments/full_optimize_parser.py +194 -0
  861. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  862. wisent/core/parser_arguments/generate_pairs_parser.py +43 -0
  863. wisent/core/parser_arguments/generate_responses_parser.py +16 -0
  864. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +148 -0
  865. wisent/core/parser_arguments/generate_vector_from_task_parser.py +149 -0
  866. wisent/core/parser_arguments/generate_vector_parser.py +89 -0
  867. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  868. wisent/core/parser_arguments/inference_config_parser.py +65 -0
  869. wisent/core/parser_arguments/main_parser.py +220 -0
  870. wisent/core/parser_arguments/model_config_parser.py +59 -0
  871. wisent/core/parser_arguments/modify_weights_parser.py +309 -0
  872. wisent/core/parser_arguments/monitor_parser.py +17 -0
  873. wisent/core/parser_arguments/multi_steer_parser.py +48 -0
  874. wisent/core/parser_arguments/nonsense_parser.py +26 -0
  875. wisent/core/parser_arguments/optimization_cache_parser.py +64 -0
  876. wisent/core/parser_arguments/optimize_classification_parser.py +108 -0
  877. wisent/core/parser_arguments/optimize_parser.py +142 -0
  878. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  879. wisent/core/parser_arguments/optimize_steering_parser.py +617 -0
  880. wisent/core/parser_arguments/optimize_weights_parser.py +403 -0
  881. wisent/core/parser_arguments/synthetic_parser.py +117 -0
  882. wisent/core/parser_arguments/tasks_parser.py +591 -0
  883. wisent/core/parser_arguments/train_unified_goodness_parser.py +172 -0
  884. wisent/core/parser_arguments/utils.py +107 -0
  885. wisent/core/prompts/__init__.py +0 -0
  886. wisent/core/prompts/core/__init__.py +0 -0
  887. wisent/core/prompts/core/atom.py +57 -0
  888. wisent/core/prompts/core/prompt_formater.py +148 -0
  889. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  890. wisent/core/prompts/prompt_stratiegies/direct_completion.py +26 -0
  891. wisent/core/prompts/prompt_stratiegies/instruction_following.py +26 -0
  892. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +31 -0
  893. wisent/core/prompts/prompt_stratiegies/role_playing.py +33 -0
  894. wisent/core/representation.py +5 -0
  895. wisent/core/save_results.py +277 -0
  896. wisent/core/steering.py +660 -0
  897. wisent/core/steering_method.py +20 -0
  898. wisent/core/steering_methods/__init__.py +54 -0
  899. wisent/core/steering_methods/core/__init__.py +0 -0
  900. wisent/core/steering_methods/core/atoms.py +154 -0
  901. wisent/core/steering_methods/methods/__init__.py +0 -0
  902. wisent/core/steering_methods/methods/caa.py +45 -0
  903. wisent/core/steering_methods/methods/prism.py +588 -0
  904. wisent/core/steering_methods/methods/pulse.py +641 -0
  905. wisent/core/steering_methods/methods/titan.py +1005 -0
  906. wisent/core/steering_methods/preflight.py +322 -0
  907. wisent/core/steering_methods/registry.py +649 -0
  908. wisent/core/steering_methods/rotator.py +121 -0
  909. wisent/core/steering_optimizer.py +1503 -0
  910. wisent/core/synthetic/__init__.py +0 -0
  911. wisent/core/synthetic/cleaners/__init__.py +0 -0
  912. wisent/core/synthetic/cleaners/core/__init__.py +0 -0
  913. wisent/core/synthetic/cleaners/core/atoms.py +58 -0
  914. wisent/core/synthetic/cleaners/deduper_cleaner.py +53 -0
  915. wisent/core/synthetic/cleaners/methods/__init__.py +0 -0
  916. wisent/core/synthetic/cleaners/methods/base_dedupers.py +321 -0
  917. wisent/core/synthetic/cleaners/methods/base_refusalers.py +286 -0
  918. wisent/core/synthetic/cleaners/methods/core/__init__.py +0 -0
  919. wisent/core/synthetic/cleaners/methods/core/atoms.py +47 -0
  920. wisent/core/synthetic/cleaners/pairs_cleaner.py +90 -0
  921. wisent/core/synthetic/cleaners/refusaler_cleaner.py +133 -0
  922. wisent/core/synthetic/db_instructions/__init__.py +0 -0
  923. wisent/core/synthetic/db_instructions/core/__init__.py +0 -0
  924. wisent/core/synthetic/db_instructions/core/atoms.py +25 -0
  925. wisent/core/synthetic/db_instructions/mini_dp.py +115 -0
  926. wisent/core/synthetic/generators/__init__.py +0 -0
  927. wisent/core/synthetic/generators/core/__init__.py +0 -0
  928. wisent/core/synthetic/generators/core/atoms.py +73 -0
  929. wisent/core/synthetic/generators/diversities/__init__.py +0 -0
  930. wisent/core/synthetic/generators/diversities/core/__init__.py +0 -0
  931. wisent/core/synthetic/generators/diversities/core/core.py +68 -0
  932. wisent/core/synthetic/generators/diversities/methods/__init__.py +0 -0
  933. wisent/core/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  934. wisent/core/synthetic/generators/nonsense_generator.py +150 -0
  935. wisent/core/synthetic/generators/pairs_generator.py +313 -0
  936. wisent/core/task_interface.py +143 -0
  937. wisent/core/task_selector.py +232 -0
  938. wisent/core/tasks/__init__.py +218 -0
  939. wisent/core/tasks/aime_task.py +142 -0
  940. wisent/core/tasks/file_task.py +212 -0
  941. wisent/core/tasks/hle_task.py +180 -0
  942. wisent/core/tasks/hmmt_task.py +120 -0
  943. wisent/core/tasks/livecodebench_task.py +94 -0
  944. wisent/core/tasks/livemathbench_task.py +159 -0
  945. wisent/core/tasks/lm_eval_task.py +611 -0
  946. wisent/core/tasks/math500_task.py +84 -0
  947. wisent/core/tasks/polymath_task.py +147 -0
  948. wisent/core/tasks/supergpqa_task.py +220 -0
  949. wisent/core/time_estimator.py +155 -0
  950. wisent/core/timing_calibration.py +176 -0
  951. wisent/core/tracking/__init__.py +54 -0
  952. wisent/core/tracking/latency.py +620 -0
  953. wisent/core/tracking/memory.py +360 -0
  954. wisent/core/trainers/__init__.py +0 -0
  955. wisent/core/trainers/core/__init__.py +11 -0
  956. wisent/core/trainers/core/atoms.py +45 -0
  957. wisent/core/trainers/steering_trainer.py +365 -0
  958. wisent/core/universal_subspace.py +918 -0
  959. wisent/core/user_model_config.py +158 -0
  960. wisent/core/utils/__init__.py +64 -0
  961. wisent/core/utils/base_rotator.py +292 -0
  962. wisent/core/utils/dataset_splits.py +197 -0
  963. wisent/core/utils/device.py +279 -0
  964. wisent/core/weight_modification/__init__.py +134 -0
  965. wisent/core/weight_modification/additive.py +340 -0
  966. wisent/core/weight_modification/directional.py +1357 -0
  967. wisent/core/weight_modification/export.py +359 -0
  968. wisent/core/weight_modification/multi_direction.py +410 -0
  969. wisent/core/weight_modification/utils.py +236 -0
  970. wisent/core/wisent.py +660 -0
  971. wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +2112 -0
  972. wisent/examples/scripts/1/test_basqueglue_evaluation.json +51 -0
  973. wisent/examples/scripts/1/test_basqueglue_pairs.json +14 -0
  974. wisent/examples/scripts/1/test_bec2016eu_evaluation.json +51 -0
  975. wisent/examples/scripts/1/test_bec2016eu_pairs.json +14 -0
  976. wisent/examples/scripts/1/test_belebele_evaluation.json +51 -0
  977. wisent/examples/scripts/1/test_belebele_pairs.json +14 -0
  978. wisent/examples/scripts/1/test_benchmarks_evaluation.json +51 -0
  979. wisent/examples/scripts/1/test_benchmarks_pairs.json +14 -0
  980. wisent/examples/scripts/1/test_bertaqa_evaluation.json +51 -0
  981. wisent/examples/scripts/1/test_bertaqa_pairs.json +14 -0
  982. wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +30 -0
  983. wisent/examples/scripts/1/test_bhtc_v2_pairs.json +8 -0
  984. wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +30 -0
  985. wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +8 -0
  986. wisent/examples/scripts/1/test_cabreu_evaluation.json +30 -0
  987. wisent/examples/scripts/1/test_cabreu_pairs.json +8 -0
  988. wisent/examples/scripts/1/test_careqa_en_evaluation.json +30 -0
  989. wisent/examples/scripts/1/test_careqa_en_pairs.json +8 -0
  990. wisent/examples/scripts/1/test_careqa_evaluation.json +30 -0
  991. wisent/examples/scripts/1/test_careqa_pairs.json +8 -0
  992. wisent/examples/scripts/1/test_catalanqa_evaluation.json +30 -0
  993. wisent/examples/scripts/1/test_catalanqa_pairs.json +8 -0
  994. wisent/examples/scripts/1/test_catcola_evaluation.json +30 -0
  995. wisent/examples/scripts/1/test_catcola_pairs.json +8 -0
  996. wisent/examples/scripts/1/test_chartqa_evaluation.json +30 -0
  997. wisent/examples/scripts/1/test_chartqa_pairs.json +8 -0
  998. wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +30 -0
  999. wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +8 -0
  1000. wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +30 -0
  1001. wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +8 -0
  1002. wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +30 -0
  1003. wisent/examples/scripts/1/test_cocoteros_es_pairs.json +8 -0
  1004. wisent/examples/scripts/1/test_coedit_gec_evaluation.json +30 -0
  1005. wisent/examples/scripts/1/test_coedit_gec_pairs.json +8 -0
  1006. wisent/examples/scripts/1/test_cola_evaluation.json +30 -0
  1007. wisent/examples/scripts/1/test_cola_pairs.json +8 -0
  1008. wisent/examples/scripts/1/test_coqcat_evaluation.json +30 -0
  1009. wisent/examples/scripts/1/test_coqcat_pairs.json +8 -0
  1010. wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +30 -0
  1011. wisent/examples/scripts/1/test_dbpedia_14_pairs.json +8 -0
  1012. wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +30 -0
  1013. wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +8 -0
  1014. wisent/examples/scripts/1/test_ethos_binary_evaluation.json +30 -0
  1015. wisent/examples/scripts/1/test_ethos_binary_pairs.json +8 -0
  1016. wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +30 -0
  1017. wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +8 -0
  1018. wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +30 -0
  1019. wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +8 -0
  1020. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +30 -0
  1021. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +8 -0
  1022. wisent/examples/scripts/2/test_arc_ar_evaluation.json +30 -0
  1023. wisent/examples/scripts/2/test_arc_ar_pairs.json +8 -0
  1024. wisent/examples/scripts/2/test_atis_evaluation.json +30 -0
  1025. wisent/examples/scripts/2/test_atis_pairs.json +8 -0
  1026. wisent/examples/scripts/2/test_babi_evaluation.json +30 -0
  1027. wisent/examples/scripts/2/test_babi_pairs.json +8 -0
  1028. wisent/examples/scripts/2/test_babilong_evaluation.json +30 -0
  1029. wisent/examples/scripts/2/test_babilong_pairs.json +8 -0
  1030. wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +30 -0
  1031. wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +8 -0
  1032. wisent/examples/scripts/2/test_basque-glue_pairs.json +14 -0
  1033. wisent/examples/scripts/benchmark_tags.json +2140 -0
  1034. wisent/examples/scripts/lm_eval_readme.json +4 -0
  1035. wisent/examples/scripts/results/benchmark_descriptions.json +1244 -0
  1036. wisent/examples/scripts/results/benchmark_evaluation_methods.json +66 -0
  1037. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +2781 -0
  1038. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +30536 -0
  1039. wisent/examples/scripts/results/benchmark_evaluators_clean.json +469 -0
  1040. wisent/examples/scripts/results/benchmark_methods_summary.json +260 -0
  1041. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +66 -0
  1042. wisent/examples/scripts/results/benchmark_pair_totals.json +269 -0
  1043. wisent/examples/scripts/results/benchmark_tags.json +917 -0
  1044. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +71 -0
  1045. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +150 -0
  1046. wisent/examples/scripts/results/failing_benchmarks.json +946 -0
  1047. wisent/examples/scripts/results/failing_benchmarks_list.json +41 -0
  1048. wisent/examples/scripts/results/failing_benchmarks_test_results.json +945 -0
  1049. wisent/examples/scripts/results/missing_benchmark_tags.json +341 -0
  1050. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +30 -0
  1051. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +8 -0
  1052. wisent/examples/scripts/results/test_AraDICE_evaluation.json +51 -0
  1053. wisent/examples/scripts/results/test_AraDICE_pairs.json +14 -0
  1054. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +30 -0
  1055. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +8 -0
  1056. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +51 -0
  1057. wisent/examples/scripts/results/test_ArabCulture_pairs.json +14 -0
  1058. wisent/examples/scripts/results/test_Tag_evaluation.json +30 -0
  1059. wisent/examples/scripts/results/test_Tag_pairs.json +8 -0
  1060. wisent/examples/scripts/results/test_aclue_evaluation.json +51 -0
  1061. wisent/examples/scripts/results/test_aclue_pairs.json +14 -0
  1062. wisent/examples/scripts/results/test_acp_bench_evaluation.json +51 -0
  1063. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +51 -0
  1064. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +14 -0
  1065. wisent/examples/scripts/results/test_acp_bench_pairs.json +14 -0
  1066. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +51 -0
  1067. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +14 -0
  1068. wisent/examples/scripts/results/test_aexams_evaluation.json +51 -0
  1069. wisent/examples/scripts/results/test_aexams_pairs.json +14 -0
  1070. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +30 -0
  1071. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +8 -0
  1072. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +30 -0
  1073. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +8 -0
  1074. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +30 -0
  1075. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +8 -0
  1076. wisent/examples/scripts/results/test_ag_news_evaluation.json +30 -0
  1077. wisent/examples/scripts/results/test_ag_news_pairs.json +8 -0
  1078. wisent/examples/scripts/results/test_agieval_evaluation.json +51 -0
  1079. wisent/examples/scripts/results/test_agieval_pairs.json +14 -0
  1080. wisent/examples/scripts/results/test_aime2024_evaluation.json +30 -0
  1081. wisent/examples/scripts/results/test_aime2024_pairs.json +8 -0
  1082. wisent/examples/scripts/results/test_aime2025_evaluation.json +30 -0
  1083. wisent/examples/scripts/results/test_aime2025_pairs.json +8 -0
  1084. wisent/examples/scripts/results/test_aime_evaluation.json +30 -0
  1085. wisent/examples/scripts/results/test_aime_pairs.json +8 -0
  1086. wisent/examples/scripts/results/test_anagrams1_evaluation.json +30 -0
  1087. wisent/examples/scripts/results/test_anagrams1_pairs.json +8 -0
  1088. wisent/examples/scripts/results/test_anagrams2_evaluation.json +30 -0
  1089. wisent/examples/scripts/results/test_anagrams2_pairs.json +8 -0
  1090. wisent/examples/scripts/results/test_anli_evaluation.json +30 -0
  1091. wisent/examples/scripts/results/test_anli_pairs.json +8 -0
  1092. wisent/examples/scripts/results/test_apps_evaluation.json +30 -0
  1093. wisent/examples/scripts/results/test_apps_pairs.json +8 -0
  1094. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +30 -0
  1095. wisent/examples/scripts/results/test_arabic_exams_pairs.json +8 -0
  1096. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +51 -0
  1097. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +14 -0
  1098. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +51 -0
  1099. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +14 -0
  1100. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +51 -0
  1101. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +14 -0
  1102. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +51 -0
  1103. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +14 -0
  1104. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +51 -0
  1105. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +14 -0
  1106. wisent/examples/scripts/results/test_arc_ar_evaluation.json +30 -0
  1107. wisent/examples/scripts/results/test_arc_ar_pairs.json +8 -0
  1108. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +30 -0
  1109. wisent/examples/scripts/results/test_arc_challenge_pairs.json +8 -0
  1110. wisent/examples/scripts/results/test_arc_easy_evaluation.json +30 -0
  1111. wisent/examples/scripts/results/test_arc_easy_pairs.json +8 -0
  1112. wisent/examples/scripts/results/test_argument_topic_evaluation.json +30 -0
  1113. wisent/examples/scripts/results/test_argument_topic_pairs.json +8 -0
  1114. wisent/examples/scripts/results/test_arithmetic_evaluation.json +51 -0
  1115. wisent/examples/scripts/results/test_arithmetic_pairs.json +14 -0
  1116. wisent/examples/scripts/results/test_asdiv_evaluation.json +30 -0
  1117. wisent/examples/scripts/results/test_asdiv_pairs.json +8 -0
  1118. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +30 -0
  1119. wisent/examples/scripts/results/test_assin_entailment_pairs.json +8 -0
  1120. wisent/examples/scripts/results/test_atis_evaluation.json +30 -0
  1121. wisent/examples/scripts/results/test_atis_pairs.json +8 -0
  1122. wisent/examples/scripts/results/test_babi_evaluation.json +30 -0
  1123. wisent/examples/scripts/results/test_babi_pairs.json +8 -0
  1124. wisent/examples/scripts/results/test_babilong_evaluation.json +30 -0
  1125. wisent/examples/scripts/results/test_babilong_pairs.json +8 -0
  1126. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +30 -0
  1127. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +8 -0
  1128. wisent/examples/scripts/results/test_banking77_evaluation.json +30 -0
  1129. wisent/examples/scripts/results/test_banking77_pairs.json +8 -0
  1130. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +14 -0
  1131. wisent/examples/scripts/results/test_basque-glue_evaluation.json +51 -0
  1132. wisent/examples/scripts/results/test_basque-glue_pairs.json +14 -0
  1133. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +51 -0
  1134. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +14 -0
  1135. wisent/examples/scripts/results/test_basque_bench_evaluation.json +51 -0
  1136. wisent/examples/scripts/results/test_basque_bench_pairs.json +14 -0
  1137. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +51 -0
  1138. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +14 -0
  1139. wisent/examples/scripts/results/test_basqueglue_evaluation.json +51 -0
  1140. wisent/examples/scripts/results/test_basqueglue_pairs.json +14 -0
  1141. wisent/examples/scripts/results/test_bbh_evaluation.json +51 -0
  1142. wisent/examples/scripts/results/test_bbh_pairs.json +14 -0
  1143. wisent/examples/scripts/results/test_bbq_evaluation.json +30 -0
  1144. wisent/examples/scripts/results/test_bbq_pairs.json +8 -0
  1145. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +51 -0
  1146. wisent/examples/scripts/results/test_bec2016eu_pairs.json +14 -0
  1147. wisent/examples/scripts/results/test_belebele_evaluation.json +51 -0
  1148. wisent/examples/scripts/results/test_belebele_pairs.json +14 -0
  1149. wisent/examples/scripts/results/test_benchmarks_evaluation.json +51 -0
  1150. wisent/examples/scripts/results/test_benchmarks_pairs.json +14 -0
  1151. wisent/examples/scripts/results/test_bertaqa_evaluation.json +51 -0
  1152. wisent/examples/scripts/results/test_bertaqa_pairs.json +14 -0
  1153. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +30 -0
  1154. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +8 -0
  1155. wisent/examples/scripts/results/test_bigbench_evaluation.json +51 -0
  1156. wisent/examples/scripts/results/test_bigbench_pairs.json +14 -0
  1157. wisent/examples/scripts/results/test_blimp_evaluation.json +51 -0
  1158. wisent/examples/scripts/results/test_blimp_pairs.json +14 -0
  1159. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +30 -0
  1160. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +8 -0
  1161. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +30 -0
  1162. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +8 -0
  1163. wisent/examples/scripts/results/test_boolq_evaluation.json +30 -0
  1164. wisent/examples/scripts/results/test_boolq_pairs.json +8 -0
  1165. wisent/examples/scripts/results/test_c4_evaluation.json +30 -0
  1166. wisent/examples/scripts/results/test_c4_pairs.json +8 -0
  1167. wisent/examples/scripts/results/test_cabreu_evaluation.json +30 -0
  1168. wisent/examples/scripts/results/test_cabreu_pairs.json +8 -0
  1169. wisent/examples/scripts/results/test_careqa_evaluation.json +30 -0
  1170. wisent/examples/scripts/results/test_careqa_pairs.json +8 -0
  1171. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +51 -0
  1172. wisent/examples/scripts/results/test_catalan_bench_pairs.json +14 -0
  1173. wisent/examples/scripts/results/test_catalanqa_evaluation.json +30 -0
  1174. wisent/examples/scripts/results/test_catalanqa_pairs.json +8 -0
  1175. wisent/examples/scripts/results/test_catcola_evaluation.json +30 -0
  1176. wisent/examples/scripts/results/test_catcola_pairs.json +8 -0
  1177. wisent/examples/scripts/results/test_cb_evaluation.json +30 -0
  1178. wisent/examples/scripts/results/test_cb_pairs.json +8 -0
  1179. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +51 -0
  1180. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +14 -0
  1181. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +30 -0
  1182. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +8 -0
  1183. wisent/examples/scripts/results/test_ceval_evaluation.json +51 -0
  1184. wisent/examples/scripts/results/test_ceval_pairs.json +14 -0
  1185. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +51 -0
  1186. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +14 -0
  1187. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +51 -0
  1188. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +14 -0
  1189. wisent/examples/scripts/results/test_chartqa_evaluation.json +30 -0
  1190. wisent/examples/scripts/results/test_chartqa_pairs.json +8 -0
  1191. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +30 -0
  1192. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +8 -0
  1193. wisent/examples/scripts/results/test_cmmlu_evaluation.json +51 -0
  1194. wisent/examples/scripts/results/test_cmmlu_pairs.json +14 -0
  1195. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +30 -0
  1196. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +8 -0
  1197. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +30 -0
  1198. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +8 -0
  1199. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +30 -0
  1200. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +8 -0
  1201. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +30 -0
  1202. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +8 -0
  1203. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +30 -0
  1204. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +8 -0
  1205. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +30 -0
  1206. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +8 -0
  1207. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +30 -0
  1208. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +8 -0
  1209. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +30 -0
  1210. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +8 -0
  1211. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +30 -0
  1212. wisent/examples/scripts/results/test_coedit_gec_pairs.json +8 -0
  1213. wisent/examples/scripts/results/test_cola_evaluation.json +30 -0
  1214. wisent/examples/scripts/results/test_cola_pairs.json +8 -0
  1215. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +30 -0
  1216. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +8 -0
  1217. wisent/examples/scripts/results/test_conala_evaluation.json +30 -0
  1218. wisent/examples/scripts/results/test_conala_pairs.json +8 -0
  1219. wisent/examples/scripts/results/test_concode_evaluation.json +30 -0
  1220. wisent/examples/scripts/results/test_concode_pairs.json +8 -0
  1221. wisent/examples/scripts/results/test_copa_evaluation.json +30 -0
  1222. wisent/examples/scripts/results/test_copa_pairs.json +8 -0
  1223. wisent/examples/scripts/results/test_copal_id_evaluation.json +30 -0
  1224. wisent/examples/scripts/results/test_copal_id_pairs.json +8 -0
  1225. wisent/examples/scripts/results/test_coqa_evaluation.json +30 -0
  1226. wisent/examples/scripts/results/test_coqa_pairs.json +8 -0
  1227. wisent/examples/scripts/results/test_coqcat_evaluation.json +30 -0
  1228. wisent/examples/scripts/results/test_coqcat_pairs.json +8 -0
  1229. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +51 -0
  1230. wisent/examples/scripts/results/test_crows_pairs_pairs.json +14 -0
  1231. wisent/examples/scripts/results/test_csatqa_evaluation.json +51 -0
  1232. wisent/examples/scripts/results/test_csatqa_pairs.json +14 -0
  1233. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +30 -0
  1234. wisent/examples/scripts/results/test_cycle_letters_pairs.json +8 -0
  1235. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +51 -0
  1236. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +14 -0
  1237. wisent/examples/scripts/results/test_darija_bench_evaluation.json +51 -0
  1238. wisent/examples/scripts/results/test_darija_bench_pairs.json +14 -0
  1239. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +30 -0
  1240. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +8 -0
  1241. wisent/examples/scripts/results/test_darijammlu_evaluation.json +51 -0
  1242. wisent/examples/scripts/results/test_darijammlu_pairs.json +14 -0
  1243. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +30 -0
  1244. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +8 -0
  1245. wisent/examples/scripts/results/test_drop_evaluation.json +30 -0
  1246. wisent/examples/scripts/results/test_drop_pairs.json +8 -0
  1247. wisent/examples/scripts/results/test_ds1000_evaluation.json +30 -0
  1248. wisent/examples/scripts/results/test_ds1000_pairs.json +8 -0
  1249. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +30 -0
  1250. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +8 -0
  1251. wisent/examples/scripts/results/test_egymmlu_evaluation.json +51 -0
  1252. wisent/examples/scripts/results/test_egymmlu_pairs.json +14 -0
  1253. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +30 -0
  1254. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +8 -0
  1255. wisent/examples/scripts/results/test_eq_bench_evaluation.json +30 -0
  1256. wisent/examples/scripts/results/test_eq_bench_pairs.json +8 -0
  1257. wisent/examples/scripts/results/test_escola_evaluation.json +30 -0
  1258. wisent/examples/scripts/results/test_escola_pairs.json +8 -0
  1259. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +30 -0
  1260. wisent/examples/scripts/results/test_ethics_cm_pairs.json +8 -0
  1261. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +30 -0
  1262. wisent/examples/scripts/results/test_ethos_binary_pairs.json +8 -0
  1263. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +51 -0
  1264. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +14 -0
  1265. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +51 -0
  1266. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +14 -0
  1267. wisent/examples/scripts/results/test_eus_exams_evaluation.json +51 -0
  1268. wisent/examples/scripts/results/test_eus_exams_pairs.json +14 -0
  1269. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +30 -0
  1270. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +8 -0
  1271. wisent/examples/scripts/results/test_eus_reading_evaluation.json +30 -0
  1272. wisent/examples/scripts/results/test_eus_reading_pairs.json +8 -0
  1273. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +30 -0
  1274. wisent/examples/scripts/results/test_eus_trivia_pairs.json +8 -0
  1275. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +51 -0
  1276. wisent/examples/scripts/results/test_evalita-mp_pairs.json +14 -0
  1277. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
  1278. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
  1279. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +51 -0
  1280. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +14 -0
  1281. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +51 -0
  1282. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +14 -0
  1283. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +30 -0
  1284. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +8 -0
  1285. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +51 -0
  1286. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +14 -0
  1287. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
  1288. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
  1289. wisent/examples/scripts/results/test_fda_evaluation.json +30 -0
  1290. wisent/examples/scripts/results/test_fda_pairs.json +8 -0
  1291. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +30 -0
  1292. wisent/examples/scripts/results/test_financial_tweets_pairs.json +8 -0
  1293. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +30 -0
  1294. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +8 -0
  1295. wisent/examples/scripts/results/test_fld_evaluation.json +30 -0
  1296. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +30 -0
  1297. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +8 -0
  1298. wisent/examples/scripts/results/test_fld_pairs.json +8 -0
  1299. wisent/examples/scripts/results/test_flores_evaluation.json +51 -0
  1300. wisent/examples/scripts/results/test_flores_pairs.json +14 -0
  1301. wisent/examples/scripts/results/test_freebase_evaluation.json +30 -0
  1302. wisent/examples/scripts/results/test_freebase_pairs.json +8 -0
  1303. wisent/examples/scripts/results/test_french_bench_evaluation.json +51 -0
  1304. wisent/examples/scripts/results/test_french_bench_pairs.json +14 -0
  1305. wisent/examples/scripts/results/test_galcola_evaluation.json +30 -0
  1306. wisent/examples/scripts/results/test_galcola_pairs.json +8 -0
  1307. wisent/examples/scripts/results/test_galician_bench_evaluation.json +51 -0
  1308. wisent/examples/scripts/results/test_galician_bench_pairs.json +14 -0
  1309. wisent/examples/scripts/results/test_glianorex_evaluation.json +30 -0
  1310. wisent/examples/scripts/results/test_glianorex_pairs.json +8 -0
  1311. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +51 -0
  1312. wisent/examples/scripts/results/test_global_mmlu_pairs.json +14 -0
  1313. wisent/examples/scripts/results/test_glue_evaluation.json +51 -0
  1314. wisent/examples/scripts/results/test_glue_pairs.json +14 -0
  1315. wisent/examples/scripts/results/test_gpqa_evaluation.json +51 -0
  1316. wisent/examples/scripts/results/test_gpqa_pairs.json +14 -0
  1317. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +51 -0
  1318. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +14 -0
  1319. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +30 -0
  1320. wisent/examples/scripts/results/test_groundcocoa_pairs.json +8 -0
  1321. wisent/examples/scripts/results/test_gsm8k_evaluation.json +30 -0
  1322. wisent/examples/scripts/results/test_gsm8k_pairs.json +8 -0
  1323. wisent/examples/scripts/results/test_haerae_evaluation.json +51 -0
  1324. wisent/examples/scripts/results/test_haerae_pairs.json +14 -0
  1325. wisent/examples/scripts/results/test_headqa_evaluation.json +30 -0
  1326. wisent/examples/scripts/results/test_headqa_pairs.json +8 -0
  1327. wisent/examples/scripts/results/test_hellaswag_evaluation.json +30 -0
  1328. wisent/examples/scripts/results/test_hellaswag_pairs.json +8 -0
  1329. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +51 -0
  1330. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +14 -0
  1331. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +51 -0
  1332. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +14 -0
  1333. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +30 -0
  1334. wisent/examples/scripts/results/test_histoires_morales_pairs.json +8 -0
  1335. wisent/examples/scripts/results/test_hmmt_evaluation.json +30 -0
  1336. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +30 -0
  1337. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +8 -0
  1338. wisent/examples/scripts/results/test_hmmt_pairs.json +8 -0
  1339. wisent/examples/scripts/results/test_hrm8k_evaluation.json +51 -0
  1340. wisent/examples/scripts/results/test_hrm8k_pairs.json +14 -0
  1341. wisent/examples/scripts/results/test_humaneval_evaluation.json +30 -0
  1342. wisent/examples/scripts/results/test_humaneval_pairs.json +8 -0
  1343. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +30 -0
  1344. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +8 -0
  1345. wisent/examples/scripts/results/test_ifeval_evaluation.json +30 -0
  1346. wisent/examples/scripts/results/test_ifeval_pairs.json +8 -0
  1347. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +30 -0
  1348. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +8 -0
  1349. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +30 -0
  1350. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +8 -0
  1351. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +51 -0
  1352. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +30 -0
  1353. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +8 -0
  1354. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +51 -0
  1355. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +14 -0
  1356. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +14 -0
  1357. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +30 -0
  1358. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +8 -0
  1359. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +30 -0
  1360. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +8 -0
  1361. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +30 -0
  1362. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +8 -0
  1363. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +30 -0
  1364. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +8 -0
  1365. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +30 -0
  1366. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +8 -0
  1367. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +51 -0
  1368. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +14 -0
  1369. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +30 -0
  1370. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +8 -0
  1371. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +30 -0
  1372. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +30 -0
  1373. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +8 -0
  1374. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +8 -0
  1375. wisent/examples/scripts/results/test_kbl_evaluation.json +51 -0
  1376. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +51 -0
  1377. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +14 -0
  1378. wisent/examples/scripts/results/test_kbl_pairs.json +14 -0
  1379. wisent/examples/scripts/results/test_kmmlu_evaluation.json +51 -0
  1380. wisent/examples/scripts/results/test_kmmlu_pairs.json +14 -0
  1381. wisent/examples/scripts/results/test_kobest_evaluation.json +51 -0
  1382. wisent/examples/scripts/results/test_kobest_pairs.json +14 -0
  1383. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +30 -0
  1384. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +8 -0
  1385. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +30 -0
  1386. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +8 -0
  1387. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +30 -0
  1388. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +8 -0
  1389. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +30 -0
  1390. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +8 -0
  1391. wisent/examples/scripts/results/test_lambada_evaluation.json +30 -0
  1392. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1393. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1394. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +51 -0
  1395. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +14 -0
  1396. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +51 -0
  1397. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +14 -0
  1398. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +51 -0
  1399. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +14 -0
  1400. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +30 -0
  1401. wisent/examples/scripts/results/test_lambada_openai_pairs.json +8 -0
  1402. wisent/examples/scripts/results/test_lambada_pairs.json +8 -0
  1403. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1404. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1405. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1406. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1407. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +30 -0
  1408. wisent/examples/scripts/results/test_lambada_standard_pairs.json +8 -0
  1409. wisent/examples/scripts/results/test_leaderboard_evaluation.json +51 -0
  1410. wisent/examples/scripts/results/test_leaderboard_pairs.json +14 -0
  1411. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +51 -0
  1412. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +14 -0
  1413. wisent/examples/scripts/results/test_libra_evaluation.json +51 -0
  1414. wisent/examples/scripts/results/test_libra_pairs.json +14 -0
  1415. wisent/examples/scripts/results/test_lingoly_evaluation.json +30 -0
  1416. wisent/examples/scripts/results/test_lingoly_pairs.json +8 -0
  1417. wisent/examples/scripts/results/test_livecodebench_evaluation.json +30 -0
  1418. wisent/examples/scripts/results/test_livecodebench_pairs.json +8 -0
  1419. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +30 -0
  1420. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +8 -0
  1421. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +30 -0
  1422. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +8 -0
  1423. wisent/examples/scripts/results/test_llama_evaluation.json +30 -0
  1424. wisent/examples/scripts/results/test_llama_pairs.json +8 -0
  1425. wisent/examples/scripts/results/test_logiqa2_evaluation.json +30 -0
  1426. wisent/examples/scripts/results/test_logiqa2_pairs.json +8 -0
  1427. wisent/examples/scripts/results/test_logiqa_evaluation.json +30 -0
  1428. wisent/examples/scripts/results/test_logiqa_pairs.json +8 -0
  1429. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +51 -0
  1430. wisent/examples/scripts/results/test_m_mmlu_pairs.json +14 -0
  1431. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +51 -0
  1432. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +14 -0
  1433. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +30 -0
  1434. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +8 -0
  1435. wisent/examples/scripts/results/test_mastermind_evaluation.json +51 -0
  1436. wisent/examples/scripts/results/test_mastermind_pairs.json +14 -0
  1437. wisent/examples/scripts/results/test_math500_evaluation.json +30 -0
  1438. wisent/examples/scripts/results/test_math500_pairs.json +8 -0
  1439. wisent/examples/scripts/results/test_math_evaluation.json +30 -0
  1440. wisent/examples/scripts/results/test_math_pairs.json +8 -0
  1441. wisent/examples/scripts/results/test_mathqa_evaluation.json +30 -0
  1442. wisent/examples/scripts/results/test_mathqa_pairs.json +8 -0
  1443. wisent/examples/scripts/results/test_mbpp_evaluation.json +30 -0
  1444. wisent/examples/scripts/results/test_mbpp_pairs.json +8 -0
  1445. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +30 -0
  1446. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +8 -0
  1447. wisent/examples/scripts/results/test_mc_taco_evaluation.json +30 -0
  1448. wisent/examples/scripts/results/test_mc_taco_pairs.json +8 -0
  1449. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +51 -0
  1450. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +14 -0
  1451. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +30 -0
  1452. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +8 -0
  1453. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +51 -0
  1454. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +14 -0
  1455. wisent/examples/scripts/results/test_meddialog_evaluation.json +30 -0
  1456. wisent/examples/scripts/results/test_meddialog_pairs.json +8 -0
  1457. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +30 -0
  1458. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +8 -0
  1459. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +30 -0
  1460. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +8 -0
  1461. wisent/examples/scripts/results/test_medmcqa_evaluation.json +30 -0
  1462. wisent/examples/scripts/results/test_medmcqa_pairs.json +8 -0
  1463. wisent/examples/scripts/results/test_medqa_evaluation.json +30 -0
  1464. wisent/examples/scripts/results/test_medqa_pairs.json +8 -0
  1465. wisent/examples/scripts/results/test_medtext_evaluation.json +30 -0
  1466. wisent/examples/scripts/results/test_medtext_pairs.json +8 -0
  1467. wisent/examples/scripts/results/test_mela_evaluation.json +51 -0
  1468. wisent/examples/scripts/results/test_mela_pairs.json +14 -0
  1469. wisent/examples/scripts/results/test_meqsum_evaluation.json +30 -0
  1470. wisent/examples/scripts/results/test_meqsum_pairs.json +8 -0
  1471. wisent/examples/scripts/results/test_mercury_evaluation.json +30 -0
  1472. wisent/examples/scripts/results/test_mercury_pairs.json +8 -0
  1473. wisent/examples/scripts/results/test_metabench_evaluation.json +51 -0
  1474. wisent/examples/scripts/results/test_metabench_pairs.json +14 -0
  1475. wisent/examples/scripts/results/test_mgsm_evaluation.json +51 -0
  1476. wisent/examples/scripts/results/test_mgsm_pairs.json +14 -0
  1477. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +30 -0
  1478. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +8 -0
  1479. wisent/examples/scripts/results/test_minerva_math_evaluation.json +51 -0
  1480. wisent/examples/scripts/results/test_minerva_math_pairs.json +14 -0
  1481. wisent/examples/scripts/results/test_mlqa_evaluation.json +51 -0
  1482. wisent/examples/scripts/results/test_mlqa_pairs.json +14 -0
  1483. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +51 -0
  1484. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +14 -0
  1485. wisent/examples/scripts/results/test_mmlu_evaluation.json +51 -0
  1486. wisent/examples/scripts/results/test_mmlu_pairs.json +14 -0
  1487. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +51 -0
  1488. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +14 -0
  1489. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +51 -0
  1490. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +14 -0
  1491. wisent/examples/scripts/results/test_mmlusr_evaluation.json +30 -0
  1492. wisent/examples/scripts/results/test_mmlusr_pairs.json +8 -0
  1493. wisent/examples/scripts/results/test_mmmu_evaluation.json +51 -0
  1494. wisent/examples/scripts/results/test_mmmu_pairs.json +14 -0
  1495. wisent/examples/scripts/results/test_mnli_evaluation.json +30 -0
  1496. wisent/examples/scripts/results/test_mnli_pairs.json +8 -0
  1497. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +51 -0
  1498. wisent/examples/scripts/results/test_model_written_evals_pairs.json +14 -0
  1499. wisent/examples/scripts/results/test_moral_stories_evaluation.json +30 -0
  1500. wisent/examples/scripts/results/test_moral_stories_pairs.json +8 -0
  1501. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +30 -0
  1502. wisent/examples/scripts/results/test_mts_dialog_pairs.json +8 -0
  1503. wisent/examples/scripts/results/test_multiblimp_evaluation.json +51 -0
  1504. wisent/examples/scripts/results/test_multiblimp_pairs.json +14 -0
  1505. wisent/examples/scripts/results/test_multimedqa_evaluation.json +51 -0
  1506. wisent/examples/scripts/results/test_multimedqa_pairs.json +14 -0
  1507. wisent/examples/scripts/results/test_multipl_e_evaluation.json +30 -0
  1508. wisent/examples/scripts/results/test_multipl_e_pairs.json +8 -0
  1509. wisent/examples/scripts/results/test_mutual_evaluation.json +30 -0
  1510. wisent/examples/scripts/results/test_mutual_pairs.json +8 -0
  1511. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1512. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +8 -0
  1513. wisent/examples/scripts/results/test_noreval_evaluation.json +51 -0
  1514. wisent/examples/scripts/results/test_noreval_pairs.json +14 -0
  1515. wisent/examples/scripts/results/test_noticia_evaluation.json +30 -0
  1516. wisent/examples/scripts/results/test_noticia_pairs.json +8 -0
  1517. wisent/examples/scripts/results/test_nq_open_evaluation.json +30 -0
  1518. wisent/examples/scripts/results/test_nq_open_pairs.json +8 -0
  1519. wisent/examples/scripts/results/test_olaph_evaluation.json +30 -0
  1520. wisent/examples/scripts/results/test_olaph_pairs.json +8 -0
  1521. wisent/examples/scripts/results/test_openbookqa_evaluation.json +30 -0
  1522. wisent/examples/scripts/results/test_openbookqa_pairs.json +8 -0
  1523. wisent/examples/scripts/results/test_openllm_evaluation.json +51 -0
  1524. wisent/examples/scripts/results/test_openllm_pairs.json +14 -0
  1525. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1526. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +8 -0
  1527. wisent/examples/scripts/results/test_paloma_evaluation.json +51 -0
  1528. wisent/examples/scripts/results/test_paloma_pairs.json +14 -0
  1529. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +30 -0
  1530. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +8 -0
  1531. wisent/examples/scripts/results/test_paws-x_evaluation.json +51 -0
  1532. wisent/examples/scripts/results/test_paws-x_pairs.json +14 -0
  1533. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +30 -0
  1534. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +8 -0
  1535. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +30 -0
  1536. wisent/examples/scripts/results/test_penn_treebank_pairs.json +8 -0
  1537. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +30 -0
  1538. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +8 -0
  1539. wisent/examples/scripts/results/test_piqa_evaluation.json +30 -0
  1540. wisent/examples/scripts/results/test_piqa_pairs.json +8 -0
  1541. wisent/examples/scripts/results/test_polemo2_evaluation.json +30 -0
  1542. wisent/examples/scripts/results/test_polemo2_pairs.json +8 -0
  1543. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +30 -0
  1544. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +8 -0
  1545. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +30 -0
  1546. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +8 -0
  1547. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +30 -0
  1548. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +8 -0
  1549. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +30 -0
  1550. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +8 -0
  1551. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +51 -0
  1552. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +14 -0
  1553. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1554. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
  1555. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1556. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
  1557. wisent/examples/scripts/results/test_prost_evaluation.json +30 -0
  1558. wisent/examples/scripts/results/test_prost_pairs.json +8 -0
  1559. wisent/examples/scripts/results/test_ptb_evaluation.json +30 -0
  1560. wisent/examples/scripts/results/test_ptb_pairs.json +8 -0
  1561. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +30 -0
  1562. wisent/examples/scripts/results/test_pubmedqa_pairs.json +8 -0
  1563. wisent/examples/scripts/results/test_pythia_evaluation.json +51 -0
  1564. wisent/examples/scripts/results/test_pythia_pairs.json +14 -0
  1565. wisent/examples/scripts/results/test_qa4mre_evaluation.json +30 -0
  1566. wisent/examples/scripts/results/test_qa4mre_pairs.json +8 -0
  1567. wisent/examples/scripts/results/test_qasper_evaluation.json +30 -0
  1568. wisent/examples/scripts/results/test_qasper_pairs.json +8 -0
  1569. wisent/examples/scripts/results/test_race_evaluation.json +30 -0
  1570. wisent/examples/scripts/results/test_race_pairs.json +8 -0
  1571. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +30 -0
  1572. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +8 -0
  1573. wisent/examples/scripts/results/test_recode_evaluation.json +30 -0
  1574. wisent/examples/scripts/results/test_recode_pairs.json +8 -0
  1575. wisent/examples/scripts/results/test_record_evaluation.json +30 -0
  1576. wisent/examples/scripts/results/test_record_pairs.json +8 -0
  1577. wisent/examples/scripts/results/test_ruler_evaluation.json +51 -0
  1578. wisent/examples/scripts/results/test_ruler_pairs.json +14 -0
  1579. wisent/examples/scripts/results/test_sciq_evaluation.json +30 -0
  1580. wisent/examples/scripts/results/test_sciq_pairs.json +8 -0
  1581. wisent/examples/scripts/results/test_score_evaluation.json +51 -0
  1582. wisent/examples/scripts/results/test_score_pairs.json +14 -0
  1583. wisent/examples/scripts/results/test_self_consistency_evaluation.json +30 -0
  1584. wisent/examples/scripts/results/test_self_consistency_pairs.json +8 -0
  1585. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +30 -0
  1586. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +8 -0
  1587. wisent/examples/scripts/results/test_siqa_evaluation.json +30 -0
  1588. wisent/examples/scripts/results/test_siqa_pairs.json +8 -0
  1589. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +51 -0
  1590. wisent/examples/scripts/results/test_spanish_bench_pairs.json +14 -0
  1591. wisent/examples/scripts/results/test_squad2_evaluation.json +30 -0
  1592. wisent/examples/scripts/results/test_squad2_pairs.json +8 -0
  1593. wisent/examples/scripts/results/test_squadv2_evaluation.json +30 -0
  1594. wisent/examples/scripts/results/test_squadv2_pairs.json +8 -0
  1595. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +30 -0
  1596. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +8 -0
  1597. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +51 -0
  1598. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +14 -0
  1599. wisent/examples/scripts/results/test_swag_evaluation.json +30 -0
  1600. wisent/examples/scripts/results/test_swag_pairs.json +8 -0
  1601. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +51 -0
  1602. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +14 -0
  1603. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +51 -0
  1604. wisent/examples/scripts/results/test_tmmluplus_pairs.json +14 -0
  1605. wisent/examples/scripts/results/test_translation_evaluation.json +51 -0
  1606. wisent/examples/scripts/results/test_translation_pairs.json +14 -0
  1607. wisent/examples/scripts/results/test_triviaqa_evaluation.json +30 -0
  1608. wisent/examples/scripts/results/test_triviaqa_pairs.json +8 -0
  1609. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +51 -0
  1610. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +14 -0
  1611. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +30 -0
  1612. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +30 -0
  1613. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +8 -0
  1614. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +30 -0
  1615. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +8 -0
  1616. wisent/examples/scripts/results/test_truthfulqa_pairs.json +8 -0
  1617. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +51 -0
  1618. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +14 -0
  1619. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +30 -0
  1620. wisent/examples/scripts/results/test_unfair_tos_pairs.json +8 -0
  1621. wisent/examples/scripts/results/test_unscramble_evaluation.json +51 -0
  1622. wisent/examples/scripts/results/test_unscramble_pairs.json +14 -0
  1623. wisent/examples/scripts/results/test_webqs_evaluation.json +30 -0
  1624. wisent/examples/scripts/results/test_webqs_pairs.json +8 -0
  1625. wisent/examples/scripts/results/test_wikitext103_evaluation.json +30 -0
  1626. wisent/examples/scripts/results/test_wikitext103_pairs.json +8 -0
  1627. wisent/examples/scripts/results/test_wikitext_evaluation.json +30 -0
  1628. wisent/examples/scripts/results/test_wikitext_pairs.json +8 -0
  1629. wisent/examples/scripts/results/test_winogender_evaluation.json +51 -0
  1630. wisent/examples/scripts/results/test_winogender_pairs.json +14 -0
  1631. wisent/examples/scripts/results/test_winogrande_evaluation.json +30 -0
  1632. wisent/examples/scripts/results/test_winogrande_pairs.json +8 -0
  1633. wisent/examples/scripts/results/test_wmdp_evaluation.json +30 -0
  1634. wisent/examples/scripts/results/test_wmdp_pairs.json +8 -0
  1635. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +30 -0
  1636. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +8 -0
  1637. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +30 -0
  1638. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +8 -0
  1639. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +30 -0
  1640. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +8 -0
  1641. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +30 -0
  1642. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +8 -0
  1643. wisent/examples/scripts/results/test_wsc273_evaluation.json +30 -0
  1644. wisent/examples/scripts/results/test_wsc273_pairs.json +8 -0
  1645. wisent/examples/scripts/results/test_xcopa_evaluation.json +51 -0
  1646. wisent/examples/scripts/results/test_xcopa_pairs.json +14 -0
  1647. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +30 -0
  1648. wisent/examples/scripts/results/test_xnli_eu_pairs.json +8 -0
  1649. wisent/examples/scripts/results/test_xnli_evaluation.json +51 -0
  1650. wisent/examples/scripts/results/test_xnli_pairs.json +14 -0
  1651. wisent/examples/scripts/results/test_xquad_evaluation.json +51 -0
  1652. wisent/examples/scripts/results/test_xquad_pairs.json +14 -0
  1653. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +51 -0
  1654. wisent/examples/scripts/results/test_xstorycloze_pairs.json +14 -0
  1655. wisent/examples/scripts/results/test_xsum_evaluation.json +30 -0
  1656. wisent/examples/scripts/results/test_xsum_pairs.json +8 -0
  1657. wisent/examples/scripts/results/test_xwinograd_evaluation.json +51 -0
  1658. wisent/examples/scripts/results/test_xwinograd_pairs.json +14 -0
  1659. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +30 -0
  1660. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +8 -0
  1661. wisent/parameters/__init__.py +1 -0
  1662. wisent/parameters/lm_eval/all_lm_eval_task_families.json +169 -0
  1663. wisent/parameters/lm_eval/broken_in_lm_eval.json +10 -0
  1664. wisent/parameters/lm_eval/evaluations_not_lm_eval_tasks.json +0 -0
  1665. wisent/parameters/lm_eval/evaluator_check.json +3476 -0
  1666. wisent/parameters/lm_eval/final_verification.json +24782 -0
  1667. wisent/parameters/lm_eval/group_task_evaluators.json +1833 -0
  1668. wisent/parameters/lm_eval/group_tasks.json +150 -0
  1669. wisent/parameters/lm_eval/individual_tasks.json +402 -0
  1670. wisent/parameters/lm_eval/no_readmes.json +1 -0
  1671. wisent/parameters/lm_eval/not_lm_eval_tasks.json +110 -0
  1672. wisent/parameters/lm_eval/read_tasks.json +208 -0
  1673. wisent/parameters/lm_eval/readme_files.json +208 -0
  1674. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +128 -0
  1675. wisent/parameters/tasks/missing_task_families.json +2963 -0
  1676. wisent/parameters/tasks/remaining_tasks_to_implement.json +199 -0
  1677. wisent/parameters/tasks/risks.json +10 -0
  1678. wisent/parameters/tasks/skills.json +14 -0
  1679. wisent/parameters/tasks/tasks.json +56031 -0
  1680. wisent/scripts/run_quality_metrics_sweep.sh +315 -0
  1681. wisent/tests/__init__.py +0 -0
  1682. wisent/tests/examples/__init__.py +0 -0
  1683. wisent/tests/examples/cli/__init__.py +0 -0
  1684. wisent/tests/examples/cli/activations/__init__.py +0 -0
  1685. wisent/tests/examples/cli/activations/test_get_activations.py +127 -0
  1686. wisent/tests/examples/cli/classifier/__init__.py +0 -0
  1687. wisent/tests/examples/cli/classifier/test_classifier_examples.py +141 -0
  1688. wisent/tests/examples/cli/contrastive_pairs/__init__.py +0 -0
  1689. wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py +89 -0
  1690. wisent/tests/examples/cli/evaluation/__init__.py +0 -0
  1691. wisent/tests/examples/cli/evaluation/test_evaluation_examples.py +117 -0
  1692. wisent/tests/examples/cli/generate/__init__.py +0 -0
  1693. wisent/tests/examples/cli/generate/test_generate_with_classifier.py +146 -0
  1694. wisent/tests/examples/cli/generate/test_generate_with_steering.py +149 -0
  1695. wisent/tests/examples/cli/generate/test_only_generate.py +110 -0
  1696. wisent/tests/examples/cli/multi_steering/__init__.py +0 -0
  1697. wisent/tests/examples/cli/multi_steering/test_multi_steer_from_trained_vectors.py +210 -0
  1698. wisent/tests/examples/cli/multi_steering/test_multi_steer_with_different_parameters.py +205 -0
  1699. wisent/tests/examples/cli/multi_steering/test_train_and_multi_steer.py +174 -0
  1700. wisent/tests/examples/cli/optimizer/__init__.py +0 -0
  1701. wisent/tests/examples/cli/optimizer/test_optimize_sample_size.py +102 -0
  1702. wisent/tests/examples/cli/optimizer/test_optimizer_examples.py +59 -0
  1703. wisent/tests/examples/cli/steering/__init__.py +0 -0
  1704. wisent/tests/examples/cli/steering/test_create_steering_vectors.py +135 -0
  1705. wisent/tests/examples/cli/synthetic/__init__.py +0 -0
  1706. wisent/tests/examples/cli/synthetic/test_synthetic_pairs.py +45 -0
  1707. wisent/tests/nosense/__init__.py +6 -0
  1708. wisent/tests/nosense/base_nosense.py +81 -0
  1709. wisent/tests/nosense/math500_nosense.py +72 -0
  1710. wisent/tests/nosense/test_robustness.py +336 -0
  1711. wisent/tests/test_all_cli_commands.py +674 -0
  1712. wisent/tests/test_geometry_comprehensive.py +327 -0
  1713. wisent/tests/test_titan_geometry.py +257 -0
  1714. wisent/tests/visualize_geometry.py +148 -0
  1715. wisent-0.7.379.dist-info/METADATA +64 -0
  1716. wisent-0.7.379.dist-info/RECORD +1720 -0
  1717. wisent-0.7.379.dist-info/WHEEL +5 -0
  1718. wisent-0.7.379.dist-info/entry_points.txt +2 -0
  1719. wisent-0.7.379.dist-info/licenses/LICENSE +21 -0
  1720. wisent-0.7.379.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2781 @@
+ {
+ "Tag": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "tag",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "aclue": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "aclue",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "acp_bench": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "acp_bench",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "acp_bench_hard": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "acp_bench_hard",
+ "benchmark_type": "other",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "advanced": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "advanced",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "aexams": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "aexams",
+ "benchmark_type": "knowledge",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "afrimgsm": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "afrimgsm",
+ "benchmark_type": "mathematics",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "afrimmlu": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "afrimmlu",
+ "benchmark_type": "knowledge",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "afrixnli": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "afrixnli",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "ag": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "ag",
+ "benchmark_type": "other",
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
+ },
+ "agieval": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "agieval",
+ "benchmark_type": "other",
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
+ },
+ "ai2_arc": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "ai2_arc",
+ "benchmark_type": "knowledge",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "aime": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "aime",
+ "benchmark_type": "mathematics",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "aime2024": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "aime",
+ "benchmark_type": "mathematics",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "aime2025": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "aime",
+ "benchmark_type": "mathematics",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "anagrams1": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "anagrams1",
+ "benchmark_type": "other",
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
+ },
+ "anagrams2": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "anagrams2",
+ "benchmark_type": "other",
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
+ },
+ "anli": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "anli",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "apps": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "apps",
+ "benchmark_type": "coding",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "arabculture": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "arabculture",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "arabic": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "arabic",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "arabic_leaderboard_complete": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "arabic_leaderboard_complete",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "arabic_leaderboard_light": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "arabic_leaderboard_light",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "arabicmmlu": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "arabicmmlu",
+ "benchmark_type": "knowledge",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "aradice": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "aradice",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "arc": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "arc",
+ "benchmark_type": "knowledge",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "arc_challenge": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "arc_challenge",
+ "benchmark_type": "knowledge",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "arc_easy": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "arc_easy",
+ "benchmark_type": "knowledge",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "argument": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "argument",
+ "benchmark_type": "other",
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
+ },
+ "arithmetic": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "arithmetic",
+ "benchmark_type": "mathematics",
+ "explanation": "Text comparison (WARNING: should use execution for mathematics)"
+ },
+ "asdiv": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "asdiv",
+ "benchmark_type": "mathematics",
+ "explanation": "Text comparison (WARNING: should use execution for mathematics)"
+ },
+ "asdiv_cot_llama": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "math",
+ "benchmark_type": "mathematics",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "assin": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "assin",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "atis": {
+ "evaluator": "generation",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "atis",
+ "benchmark_type": "other",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "babi": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "babi",
+ "benchmark_type": "other",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "babilong": {
+ "evaluator": "generation",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "babilong",
+ "benchmark_type": "other",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "bangla_mmlu": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "bangla_mmlu",
+ "benchmark_type": "knowledge",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "banking77": {
+ "evaluator": "exact_match",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "banking77",
+ "benchmark_type": "other",
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
+ },
+ "basque_bench": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "basque_bench",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "basque_glue": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "basque_glue",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "basqueglue": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "basqueglue",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "bbh": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "bbh",
+ "benchmark_type": "other",
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
+ },
+ "bbq": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "bbq",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "bec2016eu": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "bec2016eu",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "belebele": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "belebele",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "benchmarks": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "benchmarks",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "bertaqa": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "bertaqa",
+ "benchmark_type": "question_answering",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "bhs": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "bhs",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "bhtc": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "bhtc",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "bigbench": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "bigbench",
+ "benchmark_type": "other",
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
+ },
+ "blimp": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "blimp",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "blimp_nl": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "blimp_nl",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "boolq": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "boolq",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "boolq_seq2seq": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "boolq_seq2seq",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "c4": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "c4",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "cabbq": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "cabbq",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "cabreu": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "cabreu",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "careqa": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "careqa",
+ "benchmark_type": "question_answering",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "catalan_bench": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "catalan_bench",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "catalanqa": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "catalanqa",
+ "benchmark_type": "question_answering",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "catcola": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "catcola",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "cb": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "cb",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "ceval": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "ceval",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "ceval_valid": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "ceval_valid",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "chain": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "chain",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "chain_of_thought": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "math",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "chartqa": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "chartqa",
+ "benchmark_type": "question_answering",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "claim": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "claim",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "click": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "click",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "cmmlu": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "cmmlu",
+ "benchmark_type": "knowledge",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "cnn": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "cnn",
+ "benchmark_type": "other",
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
+ },
+ "cocoteros": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "cocoteros",
+ "benchmark_type": "other",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "code2text": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "code2text",
+ "benchmark_type": "coding",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "code_x_glue": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "code_x_glue",
+ "benchmark_type": "coding",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "codexglue": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "codexglue",
+ "benchmark_type": "coding",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "codexglue_code_to_text_go": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "codexglue",
+ "benchmark_type": "coding",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "codexglue_code_to_text_java": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "codexglue",
+ "benchmark_type": "coding",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "codexglue_code_to_text_javascript": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "codexglue",
+ "benchmark_type": "coding",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "codexglue_code_to_text_php": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "codexglue",
+ "benchmark_type": "coding",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "codexglue_code_to_text_python": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "codexglue",
+ "benchmark_type": "coding",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "codexglue_code_to_text_ruby": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "codexglue",
+ "benchmark_type": "coding",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "coedit": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "coedit",
+ "benchmark_type": "other",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "cola": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "cola",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "commonsense": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "commonsense",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "commonsense_qa": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "commonsense_qa",
+ "benchmark_type": "question_answering",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "conala": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "conala",
+ "benchmark_type": "coding",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "concode": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "concode",
+ "benchmark_type": "coding",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "copa": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "copa",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "copal_id": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "copal_id",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "coqa": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "coqa",
+ "benchmark_type": "question_answering",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "coqcat": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "coqcat",
+ "benchmark_type": "other",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "crows_pairs": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "crows_pairs",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "csatqa": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "csatqa",
+ "benchmark_type": "question_answering",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "cycle": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "cycle",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "cycle_letters": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "cycle_letters",
+ "benchmark_type": "other",
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
+ },
+ "darija_bench": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "darija_bench",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "darijahellaswag": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "darijahellaswag",
+ "benchmark_type": "knowledge",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "darijammlu": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "darijammlu",
+ "benchmark_type": "knowledge",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "dbpedia": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "dbpedia",
+ "benchmark_type": "other",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "discrim_eval": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "discrim_eval",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "doc": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "doc",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "doc_vqa": {
+ "evaluator": "generation",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "doc_vqa",
+ "benchmark_type": "question_answering",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "drop": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "drop",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "ds1000": {
+ "evaluator": "exact_match",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "ds1000",
+ "benchmark_type": "coding",
+ "explanation": "Text comparison (WARNING: should use execution for coding)"
+ },
+ "ds_1000": {
+ "evaluator": null,
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "ds_1000",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "egyhellaswag": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "egyhellaswag",
+ "benchmark_type": "knowledge",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "egymmlu": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "egymmlu",
+ "benchmark_type": "knowledge",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "epec": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "epec",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "eq": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "eq",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "eq_bench": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "eq_bench",
+ "benchmark_type": "other",
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
+ },
+ "eq_bench_ca": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "eq_bench_ca",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "eq_bench_es": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "eq_bench_es",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "esbbq": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "esbbq",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "escola": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "escola",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "ethics": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "ethics",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "ethos": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "ethos",
+ "benchmark_type": "other",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "eus": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "eus",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "eus_exams": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "eus_exams",
+ "benchmark_type": "knowledge",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "eus_proficiency": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "eus_proficiency",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "eus_reading": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "eus_reading",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "eus_trivia": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "eus_trivia",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "evalita_llm": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "evalita_llm",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "evalita_mp": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "evalita_mp",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "evalita_sp": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "evalita_sp",
+ "benchmark_type": "other",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "fda": {
+ "evaluator": "generation",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "fda",
+ "benchmark_type": "other",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "financial": {
+ "evaluator": null,
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "financial",
+ "benchmark_type": "other",
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
+ },
+ "financial_tweets": {
+ "evaluator": "generation",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "financial_tweets",
+ "benchmark_type": "other",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "flan": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "flan",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "fld": {
+ "evaluator": "exact_match",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "fld",
+ "benchmark_type": "other",
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
+ },
+ "flores": {
+ "evaluator": "generation",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "flores",
+ "benchmark_type": "translation",
+ "explanation": "Text generation evaluation - assesses quality of generated text"
+ },
+ "freebase": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "huggingface_pairs",
+ "extractor_file": "freebase",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "french_bench": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "french_bench",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "galcola": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "galcola",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "galician_bench": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "galician_bench",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
+ },
+ "gaokao": {
+ "evaluator": "log_likelihoods",
+ "extractor_location": "lm_eval_pairs",
+ "extractor_file": "gaokao",
+ "benchmark_type": "other",
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
946
+ },
947
+ "glianorex": {
948
+ "evaluator": "log_likelihoods",
949
+ "extractor_location": "huggingface_pairs",
950
+ "extractor_file": "glianorex",
951
+ "benchmark_type": "other",
952
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
953
+ },
954
+ "global_mmlu": {
955
+ "evaluator": "log_likelihoods",
956
+ "extractor_location": "lm_eval_pairs",
957
+ "extractor_file": "global_mmlu",
958
+ "benchmark_type": "knowledge",
959
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
960
+ },
961
+ "global_piqa": {
962
+ "evaluator": null,
963
+ "extractor_location": "lm_eval_pairs",
964
+ "extractor_file": "global_piqa",
965
+ "benchmark_type": "question_answering",
966
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
967
+ },
968
+ "glue": {
969
+ "evaluator": "log_likelihoods",
970
+ "extractor_location": "lm_eval_pairs",
971
+ "extractor_file": "glue",
972
+ "benchmark_type": "other",
973
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
974
+ },
975
+ "gpqa": {
976
+ "evaluator": "log_likelihoods",
977
+ "extractor_location": "lm_eval_pairs",
978
+ "extractor_file": "gpqa",
979
+ "benchmark_type": "question_answering",
980
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
981
+ },
982
+ "gpt3": {
983
+ "evaluator": "log_likelihoods",
984
+ "extractor_location": "lm_eval_pairs",
985
+ "extractor_file": "gpt3",
986
+ "benchmark_type": "other",
987
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
988
+ },
989
+ "groundcocoa": {
990
+ "evaluator": "generation",
991
+ "extractor_location": "lm_eval_pairs",
992
+ "extractor_file": "groundcocoa",
993
+ "benchmark_type": "other",
994
+ "explanation": "Text generation evaluation - assesses quality of generated text"
995
+ },
996
+ "gsm": {
997
+ "evaluator": "exact_match",
998
+ "extractor_location": "lm_eval_pairs",
999
+ "extractor_file": "gsm",
1000
+ "benchmark_type": "mathematics",
1001
+ "explanation": "Text comparison (WARNING: should use execution for mathematics)"
1002
+ },
1003
+ "gsm8k": {
1004
+ "evaluator": "exact_match",
1005
+ "extractor_location": "lm_eval_pairs",
1006
+ "extractor_file": "gsm8k",
1007
+ "benchmark_type": "mathematics",
1008
+ "explanation": "Text comparison (WARNING: should use execution for mathematics)"
1009
+ },
1010
+ "gsm8k_cot": {
1011
+ "evaluator": null,
1012
+ "extractor_location": "huggingface_pairs",
1013
+ "extractor_file": "math",
1014
+ "benchmark_type": "mathematics",
1015
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1016
+ },
1017
+ "gsm8k_cot_llama": {
1018
+ "evaluator": null,
1019
+ "extractor_location": "huggingface_pairs",
1020
+ "extractor_file": "math",
1021
+ "benchmark_type": "mathematics",
1022
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1023
+ },
1024
+ "gsm8k_cot_self_consistency": {
1025
+ "evaluator": null,
1026
+ "extractor_location": "huggingface_pairs",
1027
+ "extractor_file": "math",
1028
+ "benchmark_type": "mathematics",
1029
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1030
+ },
1031
+ "gsm8k_llama": {
1032
+ "evaluator": null,
1033
+ "extractor_location": "huggingface_pairs",
1034
+ "extractor_file": "math",
1035
+ "benchmark_type": "mathematics",
1036
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1037
+ },
1038
+ "gsm8k_platinum_cot": {
1039
+ "evaluator": null,
1040
+ "extractor_location": "huggingface_pairs",
1041
+ "extractor_file": "math",
1042
+ "benchmark_type": "mathematics",
1043
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1044
+ },
1045
+ "gsm8k_platinum_cot_llama": {
1046
+ "evaluator": null,
1047
+ "extractor_location": "huggingface_pairs",
1048
+ "extractor_file": "math",
1049
+ "benchmark_type": "mathematics",
1050
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1051
+ },
1052
+ "gsm8k_platinum_cot_self_consistency": {
1053
+ "evaluator": null,
1054
+ "extractor_location": "huggingface_pairs",
1055
+ "extractor_file": "math",
1056
+ "benchmark_type": "mathematics",
1057
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1058
+ },
1059
+ "gsm_plus": {
1060
+ "evaluator": "exact_match",
1061
+ "extractor_location": "huggingface_pairs",
1062
+ "extractor_file": "gsm_plus",
1063
+ "benchmark_type": "mathematics",
1064
+ "explanation": "Text comparison (WARNING: should use execution for mathematics)"
1065
+ },
1066
+ "haerae": {
1067
+ "evaluator": null,
1068
+ "extractor_location": "lm_eval_pairs",
1069
+ "extractor_file": "haerae",
1070
+ "benchmark_type": "other",
1071
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1072
+ },
1073
+ "headqa": {
1074
+ "evaluator": null,
1075
+ "extractor_location": "lm_eval_pairs",
1076
+ "extractor_file": "headqa",
1077
+ "benchmark_type": "question_answering",
1078
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1079
+ },
1080
+ "hellaswag": {
1081
+ "evaluator": "log_likelihoods",
1082
+ "extractor_location": "lm_eval_pairs",
1083
+ "extractor_file": "hellaswag",
1084
+ "benchmark_type": "knowledge",
1085
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1086
+ },
1087
+ "hendrycks_ethics": {
1088
+ "evaluator": null,
1089
+ "extractor_location": "lm_eval_pairs",
1090
+ "extractor_file": "hendrycks_ethics",
1091
+ "benchmark_type": "other",
1092
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1093
+ },
1094
+ "hendrycks_math": {
1095
+ "evaluator": null,
1096
+ "extractor_location": "lm_eval_pairs",
1097
+ "extractor_file": "hendrycks_math",
1098
+ "benchmark_type": "mathematics",
1099
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1100
+ },
1101
+ "histoires_morales": {
1102
+ "evaluator": "generation",
1103
+ "extractor_location": "lm_eval_pairs",
1104
+ "extractor_file": "histoires_morales",
1105
+ "benchmark_type": "other",
1106
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1107
+ },
1108
+ "hle": {
1109
+ "evaluator": null,
1110
+ "extractor_location": "huggingface_pairs",
1111
+ "extractor_file": "hle",
1112
+ "benchmark_type": "other",
1113
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1114
+ },
1115
+ "hle_exact_match": {
1116
+ "evaluator": null,
1117
+ "extractor_location": "huggingface_pairs",
1118
+ "extractor_file": "hle",
1119
+ "benchmark_type": "other",
1120
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1121
+ },
1122
+ "hle_multiple_choice": {
1123
+ "evaluator": null,
1124
+ "extractor_location": "huggingface_pairs",
1125
+ "extractor_file": "hle",
1126
+ "benchmark_type": "other",
1127
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1128
+ },
1129
+ "hmmt": {
1130
+ "evaluator": null,
1131
+ "extractor_location": "huggingface_pairs",
1132
+ "extractor_file": "hmmt",
1133
+ "benchmark_type": "mathematics",
1134
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1135
+ },
1136
+ "hmmt_feb_2025": {
1137
+ "evaluator": null,
1138
+ "extractor_location": "huggingface_pairs",
1139
+ "extractor_file": "hmmt",
1140
+ "benchmark_type": "mathematics",
1141
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1142
+ },
1143
+ "hrm8k": {
1144
+ "evaluator": "exact_match",
1145
+ "extractor_location": "lm_eval_pairs",
1146
+ "extractor_file": "hrm8k",
1147
+ "benchmark_type": "other",
1148
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
1149
+ },
1150
+ "humaneval": {
1151
+ "evaluator": null,
1152
+ "extractor_location": "huggingface_pairs",
1153
+ "extractor_file": "humaneval",
1154
+ "benchmark_type": "coding",
1155
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1156
+ },
1157
+ "humaneval_64_instruct": {
1158
+ "evaluator": null,
1159
+ "extractor_location": "huggingface_pairs",
1160
+ "extractor_file": "instructhumaneval",
1161
+ "benchmark_type": "coding",
1162
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1163
+ },
1164
+ "humaneval_infilling": {
1165
+ "evaluator": null,
1166
+ "extractor_location": "lm_eval_pairs",
1167
+ "extractor_file": "humaneval_infilling",
1168
+ "benchmark_type": "coding",
1169
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1170
+ },
1171
+ "humaneval_instruct": {
1172
+ "evaluator": null,
1173
+ "extractor_location": "huggingface_pairs",
1174
+ "extractor_file": "instructhumaneval",
1175
+ "benchmark_type": "coding",
1176
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1177
+ },
1178
+ "humaneval_plus": {
1179
+ "evaluator": null,
1180
+ "extractor_location": "huggingface_pairs",
1181
+ "extractor_file": "humaneval",
1182
+ "benchmark_type": "coding",
1183
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1184
+ },
1185
+ "humanevalpack": {
1186
+ "evaluator": "exact_match",
1187
+ "extractor_location": "huggingface_pairs",
1188
+ "extractor_file": "humanevalpack",
1189
+ "benchmark_type": "coding",
1190
+ "explanation": "Text comparison (WARNING: should use execution for coding)"
1191
+ },
1192
+ "icelandic_winogrande": {
1193
+ "evaluator": null,
1194
+ "extractor_location": "lm_eval_pairs",
1195
+ "extractor_file": "icelandic_winogrande",
1196
+ "benchmark_type": "other",
1197
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1198
+ },
1199
+ "ifeval": {
1200
+ "evaluator": "exact_match",
1201
+ "extractor_location": "lm_eval_pairs",
1202
+ "extractor_file": "ifeval",
1203
+ "benchmark_type": "other",
1204
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
1205
+ },
1206
+ "instruct_humaneval": {
1207
+ "evaluator": null,
1208
+ "extractor_location": "huggingface_pairs",
1209
+ "extractor_file": "instructhumaneval",
1210
+ "benchmark_type": "coding",
1211
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1212
+ },
1213
+ "instructhumaneval": {
1214
+ "evaluator": null,
1215
+ "extractor_location": "huggingface_pairs",
1216
+ "extractor_file": "instructhumaneval",
1217
+ "benchmark_type": "coding",
1218
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1219
+ },
1220
+ "inverse": {
1221
+ "evaluator": null,
1222
+ "extractor_location": "lm_eval_pairs",
1223
+ "extractor_file": "inverse",
1224
+ "benchmark_type": "other",
1225
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1226
+ },
1227
+ "inverse_scaling": {
1228
+ "evaluator": "log_likelihoods",
1229
+ "extractor_location": "lm_eval_pairs",
1230
+ "extractor_file": "inverse_scaling",
1231
+ "benchmark_type": "other",
1232
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1233
+ },
1234
+ "iwslt2017": {
1235
+ "evaluator": "generation",
1236
+ "extractor_location": "lm_eval_pairs",
1237
+ "extractor_file": "iwslt2017",
1238
+ "benchmark_type": "translation",
1239
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1240
+ },
1241
+ "iwslt2017_ar_en": {
1242
+ "evaluator": "generation",
1243
+ "extractor_location": "huggingface_pairs",
1244
+ "extractor_file": "iwslt2017_ar_en",
1245
+ "benchmark_type": "translation",
1246
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1247
+ },
1248
+ "iwslt2017_en_ar": {
1249
+ "evaluator": "generation",
1250
+ "extractor_location": "huggingface_pairs",
1251
+ "extractor_file": "iwslt2017_en_ar",
1252
+ "benchmark_type": "translation",
1253
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1254
+ },
1255
+ "ja": {
1256
+ "evaluator": null,
1257
+ "extractor_location": "lm_eval_pairs",
1258
+ "extractor_file": "ja",
1259
+ "benchmark_type": "other",
1260
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1261
+ },
1262
+ "japanese_leaderboard": {
1263
+ "evaluator": "log_likelihoods",
1264
+ "extractor_location": "lm_eval_pairs",
1265
+ "extractor_file": "japanese_leaderboard",
1266
+ "benchmark_type": "other",
1267
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1268
+ },
1269
+ "jsonschema_bench": {
1270
+ "evaluator": "generation",
1271
+ "extractor_location": "lm_eval_pairs",
1272
+ "extractor_file": "jsonschema_bench",
1273
+ "benchmark_type": "other",
1274
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1275
+ },
1276
+ "kbl": {
1277
+ "evaluator": "log_likelihoods",
1278
+ "extractor_location": "lm_eval_pairs",
1279
+ "extractor_file": "kbl",
1280
+ "benchmark_type": "other",
1281
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1282
+ },
1283
+ "kmmlu": {
1284
+ "evaluator": "log_likelihoods",
1285
+ "extractor_location": "lm_eval_pairs",
1286
+ "extractor_file": "kmmlu",
1287
+ "benchmark_type": "knowledge",
1288
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1289
+ },
1290
+ "kobest": {
1291
+ "evaluator": null,
1292
+ "extractor_location": "lm_eval_pairs",
1293
+ "extractor_file": "kobest",
1294
+ "benchmark_type": "other",
1295
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1296
+ },
1297
+ "kormedmcqa": {
1298
+ "evaluator": "generation",
1299
+ "extractor_location": "lm_eval_pairs",
1300
+ "extractor_file": "kormedmcqa",
1301
+ "benchmark_type": "question_answering",
1302
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1303
+ },
1304
+ "lambada": {
1305
+ "evaluator": "exact_match",
1306
+ "extractor_location": "lm_eval_pairs",
1307
+ "extractor_file": "lambada",
1308
+ "benchmark_type": "other",
1309
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
1310
+ },
1311
+ "lambada_cloze": {
1312
+ "evaluator": null,
1313
+ "extractor_location": "lm_eval_pairs",
1314
+ "extractor_file": "lambada_cloze",
1315
+ "benchmark_type": "other",
1316
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1317
+ },
1318
+ "lambada_multilingual": {
1319
+ "evaluator": null,
1320
+ "extractor_location": "lm_eval_pairs",
1321
+ "extractor_file": "lambada_multilingual",
1322
+ "benchmark_type": "other",
1323
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1324
+ },
1325
+ "lambada_multilingual_stablelm": {
1326
+ "evaluator": "log_likelihoods",
1327
+ "extractor_location": "lm_eval_pairs",
1328
+ "extractor_file": "lambada_multilingual_stablelm",
1329
+ "benchmark_type": "other",
1330
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1331
+ },
1332
+ "law": {
1333
+ "evaluator": null,
1334
+ "extractor_location": "lm_eval_pairs",
1335
+ "extractor_file": "law",
1336
+ "benchmark_type": "other",
1337
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1338
+ },
1339
+ "law_stack_exchange": {
1340
+ "evaluator": "generation",
1341
+ "extractor_location": "huggingface_pairs",
1342
+ "extractor_file": "law_stack_exchange",
1343
+ "benchmark_type": "other",
1344
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1345
+ },
1346
+ "leaderboard": {
1347
+ "evaluator": null,
1348
+ "extractor_location": "lm_eval_pairs",
1349
+ "extractor_file": "leaderboard",
1350
+ "benchmark_type": "other",
1351
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1352
+ },
1353
+ "ledgar": {
1354
+ "evaluator": null,
1355
+ "extractor_location": "huggingface_pairs",
1356
+ "extractor_file": "ledgar",
1357
+ "benchmark_type": "other",
1358
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1359
+ },
1360
+ "libra": {
1361
+ "evaluator": "generation",
1362
+ "extractor_location": "lm_eval_pairs",
1363
+ "extractor_file": "libra",
1364
+ "benchmark_type": "other",
1365
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1366
+ },
1367
+ "lingoly": {
1368
+ "evaluator": "log_likelihoods",
1369
+ "extractor_location": "lm_eval_pairs",
1370
+ "extractor_file": "lingoly",
1371
+ "benchmark_type": "other",
1372
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1373
+ },
1374
+ "livecodebench": {
1375
+ "evaluator": null,
1376
+ "extractor_location": "huggingface_pairs",
1377
+ "extractor_file": "livecodebench",
1378
+ "benchmark_type": "coding",
1379
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1380
+ },
1381
+ "livemathbench": {
1382
+ "evaluator": null,
1383
+ "extractor_location": "huggingface_pairs",
1384
+ "extractor_file": "livemathbench",
1385
+ "benchmark_type": "mathematics",
1386
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1387
+ },
1388
+ "livemathbench_cnmo_en": {
1389
+ "evaluator": null,
1390
+ "extractor_location": "huggingface_pairs",
1391
+ "extractor_file": "livemathbench_configs",
1392
+ "benchmark_type": "mathematics",
1393
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1394
+ },
1395
+ "livemathbench_cnmo_zh": {
1396
+ "evaluator": null,
1397
+ "extractor_location": "huggingface_pairs",
1398
+ "extractor_file": "livemathbench_configs",
1399
+ "benchmark_type": "mathematics",
1400
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1401
+ },
1402
+ "llama": {
1403
+ "evaluator": null,
1404
+ "extractor_location": "huggingface_pairs",
1405
+ "extractor_file": "llama",
1406
+ "benchmark_type": "other",
1407
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1408
+ },
1409
+ "llama3": {
1410
+ "evaluator": null,
1411
+ "extractor_location": "lm_eval_pairs",
1412
+ "extractor_file": "llama3",
1413
+ "benchmark_type": "other",
1414
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1415
+ },
1416
+ "lm_syneval": {
1417
+ "evaluator": null,
1418
+ "extractor_location": "lm_eval_pairs",
1419
+ "extractor_file": "lm_syneval",
1420
+ "benchmark_type": "other",
1421
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1422
+ },
1423
+ "logieval": {
1424
+ "evaluator": null,
1425
+ "extractor_location": "huggingface_pairs",
1426
+ "extractor_file": "logieval",
1427
+ "benchmark_type": "other",
1428
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1429
+ },
1430
+ "logiqa": {
1431
+ "evaluator": null,
1432
+ "extractor_location": "lm_eval_pairs",
1433
+ "extractor_file": "logiqa",
1434
+ "benchmark_type": "question_answering",
1435
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1436
+ },
1437
+ "logiqa2": {
1438
+ "evaluator": null,
1439
+ "extractor_location": "lm_eval_pairs",
1440
+ "extractor_file": "logiqa2",
1441
+ "benchmark_type": "question_answering",
1442
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1443
+ },
1444
+ "longbench": {
1445
+ "evaluator": null,
1446
+ "extractor_location": "lm_eval_pairs",
1447
+ "extractor_file": "longbench",
1448
+ "benchmark_type": "other",
1449
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1450
+ },
1451
+ "longbenchv2": {
1452
+ "evaluator": null,
1453
+ "extractor_location": "lm_eval_pairs",
1454
+ "extractor_file": "longbenchv2",
1455
+ "benchmark_type": "other",
1456
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1457
+ },
1458
+ "m_mmlu": {
1459
+ "evaluator": null,
1460
+ "extractor_location": "huggingface_pairs",
1461
+ "extractor_file": "m_mmlu",
1462
+ "benchmark_type": "knowledge",
1463
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1464
+ },
1465
+ "mastermind": {
1466
+ "evaluator": "log_likelihoods",
1467
+ "extractor_location": "lm_eval_pairs",
1468
+ "extractor_file": "mastermind",
1469
+ "benchmark_type": "other",
1470
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1471
+ },
1472
+ "math": {
1473
+ "evaluator": null,
1474
+ "extractor_location": "huggingface_pairs",
1475
+ "extractor_file": "math",
1476
+ "benchmark_type": "mathematics",
1477
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1478
+ },
1479
+ "math500": {
1480
+ "evaluator": null,
1481
+ "extractor_location": "huggingface_pairs",
1482
+ "extractor_file": "math",
1483
+ "benchmark_type": "mathematics",
1484
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1485
+ },
1486
+ "math_500": {
1487
+ "evaluator": null,
1488
+ "extractor_location": "huggingface_pairs",
1489
+ "extractor_file": "math",
1490
+ "benchmark_type": "mathematics",
1491
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1492
+ },
1493
+ "mathqa": {
1494
+ "evaluator": null,
1495
+ "extractor_location": "lm_eval_pairs",
1496
+ "extractor_file": "mathqa",
1497
+ "benchmark_type": "mathematics",
1498
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1499
+ },
1500
+ "mbpp": {
1501
+ "evaluator": null,
1502
+ "extractor_location": "huggingface_pairs",
1503
+ "extractor_file": "mbpp",
1504
+ "benchmark_type": "coding",
1505
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1506
+ },
1507
+ "mbpp_plus": {
1508
+ "evaluator": null,
1509
+ "extractor_location": "huggingface_pairs",
1510
+ "extractor_file": "mbpp",
1511
+ "benchmark_type": "coding",
1512
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1513
+ },
1514
+ "mc-taco": {
1515
+ "evaluator": null,
1516
+ "extractor_location": "lm_eval_pairs",
1517
+ "extractor_file": "mc-taco",
1518
+ "benchmark_type": "other",
1519
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1520
+ },
1521
+ "med_concepts_qa": {
1522
+ "evaluator": "log_likelihoods",
1523
+ "extractor_location": "lm_eval_pairs",
1524
+ "extractor_file": "med_concepts_qa",
1525
+ "benchmark_type": "question_answering",
1526
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1527
+ },
1528
+ "meddialog": {
1529
+ "evaluator": null,
1530
+ "extractor_location": "huggingface_pairs",
1531
+ "extractor_file": "meddialog",
1532
+ "benchmark_type": "other",
1533
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1534
+ },
1535
+ "meddialog_qsumm": {
1536
+ "evaluator": null,
1537
+ "extractor_location": "huggingface_pairs",
1538
+ "extractor_file": "meddialog",
1539
+ "benchmark_type": "other",
1540
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1541
+ },
1542
+ "meddialog_qsumm_perplexity": {
1543
+ "evaluator": null,
1544
+ "extractor_location": "huggingface_pairs",
1545
+ "extractor_file": "meddialog",
1546
+ "benchmark_type": "other",
1547
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1548
+ },
1549
+ "meddialog_raw_dialogues": {
1550
+ "evaluator": null,
1551
+ "extractor_location": "huggingface_pairs",
1552
+ "extractor_file": "meddialog",
1553
+ "benchmark_type": "other",
1554
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1555
+ },
1556
+ "meddialog_raw_perplexity": {
1557
+ "evaluator": null,
1558
+ "extractor_location": "huggingface_pairs",
1559
+ "extractor_file": "meddialog",
1560
+ "benchmark_type": "other",
1561
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1562
+ },
1563
+ "medical": {
1564
+ "evaluator": null,
1565
+ "extractor_location": "lm_eval_pairs",
1566
+ "extractor_file": "medical",
1567
+ "benchmark_type": "other",
1568
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1569
+ },
1570
+ "medical_abstracts": {
1571
+ "evaluator": "generation",
1572
+ "extractor_location": "huggingface_pairs",
1573
+ "extractor_file": "medical_abstracts",
1574
+ "benchmark_type": "other",
1575
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1576
+ },
1577
+ "mediqa_qa2019": {
1578
+ "evaluator": "generation",
1579
+ "extractor_location": "lm_eval_pairs",
1580
+ "extractor_file": "mediqa_qa2019",
1581
+ "benchmark_type": "question_answering",
1582
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1583
+ },
1584
+ "medmcqa": {
1585
+ "evaluator": "log_likelihoods",
1586
+ "extractor_location": "lm_eval_pairs",
1587
+ "extractor_file": "medmcqa",
1588
+ "benchmark_type": "question_answering",
1589
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1590
+ },
1591
+ "medqa": {
1592
+ "evaluator": null,
1593
+ "extractor_location": "lm_eval_pairs",
1594
+ "extractor_file": "medqa",
1595
+ "benchmark_type": "question_answering",
1596
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1597
+ },
1598
+ "medtext": {
1599
+ "evaluator": "generation",
1600
+ "extractor_location": "lm_eval_pairs",
1601
+ "extractor_file": "medtext",
1602
+ "benchmark_type": "other",
1603
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1604
+ },
1605
+ "mela": {
1606
+ "evaluator": "log_likelihoods",
1607
+ "extractor_location": "huggingface_pairs",
1608
+ "extractor_file": "mela",
1609
+ "benchmark_type": "other",
1610
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1611
+ },
1612
+ "meqsum": {
1613
+ "evaluator": "generation",
1614
+ "extractor_location": "lm_eval_pairs",
1615
+ "extractor_file": "meqsum",
1616
+ "benchmark_type": "other",
1617
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1618
+ },
1619
+ "mercury": {
1620
+ "evaluator": null,
1621
+ "extractor_location": "huggingface_pairs",
1622
+ "extractor_file": "mercury",
1623
+ "benchmark_type": "other",
1624
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1625
+ },
1626
+ "metabench": {
1627
+ "evaluator": null,
1628
+ "extractor_location": "lm_eval_pairs",
1629
+ "extractor_file": "metabench",
1630
+ "benchmark_type": "other",
1631
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1632
+ },
1633
+ "mgsm": {
1634
+ "evaluator": "generation",
1635
+ "extractor_location": "lm_eval_pairs",
1636
+ "extractor_file": "mgsm",
1637
+ "benchmark_type": "mathematics",
1638
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1639
+ },
1640
+ "mimic_repsum": {
1641
+ "evaluator": "generation",
1642
+ "extractor_location": "lm_eval_pairs",
1643
+ "extractor_file": "mimic_repsum",
1644
+ "benchmark_type": "other",
1645
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1646
+ },
1647
+ "minerva_math": {
1648
+ "evaluator": "generation",
1649
+ "extractor_location": "lm_eval_pairs",
1650
+ "extractor_file": "minerva_math",
1651
+ "benchmark_type": "mathematics",
1652
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1653
+ },
1654
+ "mlqa": {
1655
+ "evaluator": "generation",
1656
+ "extractor_location": "lm_eval_pairs",
1657
+ "extractor_file": "mlqa",
1658
+ "benchmark_type": "question_answering",
1659
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1660
+ },
1661
+ "mmlu": {
1662
+ "evaluator": "log_likelihoods",
1663
+ "extractor_location": "lm_eval_pairs",
1664
+ "extractor_file": "mmlu",
1665
+ "benchmark_type": "knowledge",
1666
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1667
+ },
1668
+ "mmlu_pro": {
1669
+ "evaluator": null,
1670
+ "extractor_location": "lm_eval_pairs",
1671
+ "extractor_file": "mmlu_pro",
1672
+ "benchmark_type": "knowledge",
1673
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1674
+ },
1675
+ "mmlusr": {
1676
+ "evaluator": null,
1677
+ "extractor_location": "huggingface_pairs",
1678
+ "extractor_file": "mmlusr",
1679
+ "benchmark_type": "knowledge",
1680
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1681
+ },
1682
+ "mmlusr_answer_only": {
1683
+ "evaluator": null,
1684
+ "extractor_location": "huggingface_pairs",
1685
+ "extractor_file": "mmlusr",
1686
+ "benchmark_type": "knowledge",
1687
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1688
+ },
1689
+ "mmlusr_question_and_answer": {
1690
+ "evaluator": null,
1691
+ "extractor_location": "huggingface_pairs",
1692
+ "extractor_file": "mmlusr",
1693
+ "benchmark_type": "knowledge",
1694
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1695
+ },
1696
+ "mmlusr_question_only": {
1697
+ "evaluator": null,
1698
+ "extractor_location": "huggingface_pairs",
1699
+ "extractor_file": "mmlusr",
1700
+ "benchmark_type": "knowledge",
1701
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1702
+ },
1703
+ "mmmu": {
1704
+ "evaluator": "log_likelihoods",
1705
+ "extractor_location": "lm_eval_pairs",
1706
+ "extractor_file": "mmmu",
1707
+ "benchmark_type": "other",
1708
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1709
+ },
1710
+ "mnli": {
1711
+ "evaluator": "log_likelihoods",
1712
+ "extractor_location": "lm_eval_pairs",
1713
+ "extractor_file": "mnli",
1714
+ "benchmark_type": "other",
1715
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1716
+ },
1717
+ "model_written_evals": {
1718
+ "evaluator": "log_likelihoods",
1719
+ "extractor_location": "lm_eval_pairs",
1720
+ "extractor_file": "model_written_evals",
1721
+ "benchmark_type": "other",
1722
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1723
+ },
1724
+ "moral_stories": {
1725
+ "evaluator": "log_likelihoods",
1726
+ "extractor_location": "lm_eval_pairs",
1727
+ "extractor_file": "moral_stories",
1728
+ "benchmark_type": "other",
1729
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1730
+ },
1731
+ "mrpc": {
1732
+ "evaluator": null,
1733
+ "extractor_location": "lm_eval_pairs",
1734
+ "extractor_file": "mrpc",
1735
+ "benchmark_type": "other",
1736
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1737
+ },
1738
+ "mts_dialog": {
1739
+ "evaluator": "generation",
1740
+ "extractor_location": "lm_eval_pairs",
1741
+ "extractor_file": "mts_dialog",
1742
+ "benchmark_type": "other",
1743
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1744
+ },
1745
+ "multiblimp": {
1746
+ "evaluator": "log_likelihoods",
1747
+ "extractor_location": "lm_eval_pairs",
1748
+ "extractor_file": "multiblimp",
1749
+ "benchmark_type": "other",
1750
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1751
+ },
1752
+ "multilingual": {
1753
+ "evaluator": null,
1754
+ "extractor_location": "lm_eval_pairs",
1755
+ "extractor_file": "multilingual",
1756
+ "benchmark_type": "other",
1757
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1758
+ },
1759
+ "multimedqa": {
1760
+ "evaluator": "log_likelihoods",
1761
+ "extractor_location": "huggingface_pairs",
1762
+ "extractor_file": "multimedqa",
1763
+ "benchmark_type": "question_answering",
1764
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1765
+ },
1766
+ "multipl_e": {
1767
+ "evaluator": null,
1768
+ "extractor_location": "huggingface_pairs",
1769
+ "extractor_file": "multipl_e",
1770
+ "benchmark_type": "other",
1771
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1772
+ },
1773
+ "multiple": {
1774
+ "evaluator": null,
1775
+ "extractor_location": "huggingface_pairs",
1776
+ "extractor_file": "multiple",
1777
+ "benchmark_type": "other",
1778
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1779
+ },
1780
+ "multiple_cpp": {
1781
+ "evaluator": null,
1782
+ "extractor_location": "huggingface_pairs",
1783
+ "extractor_file": "multipl_e",
1784
+ "benchmark_type": "other",
1785
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1786
+ },
1787
+ "multiple_go": {
1788
+ "evaluator": null,
1789
+ "extractor_location": "huggingface_pairs",
1790
+ "extractor_file": "multipl_e",
1791
+ "benchmark_type": "other",
1792
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1793
+ },
1794
+ "multiple_java": {
1795
+ "evaluator": null,
1796
+ "extractor_location": "huggingface_pairs",
1797
+ "extractor_file": "multipl_e",
1798
+ "benchmark_type": "other",
1799
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1800
+ },
1801
+ "multiple_js": {
1802
+ "evaluator": null,
1803
+ "extractor_location": "huggingface_pairs",
1804
+ "extractor_file": "multipl_e",
1805
+ "benchmark_type": "other",
1806
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1807
+ },
1808
+ "multiple_py": {
1809
+ "evaluator": null,
1810
+ "extractor_location": "huggingface_pairs",
1811
+ "extractor_file": "multipl_e",
1812
+ "benchmark_type": "other",
1813
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1814
+ },
1815
+ "multiple_rs": {
1816
+ "evaluator": null,
1817
+ "extractor_location": "huggingface_pairs",
1818
+ "extractor_file": "multipl_e",
1819
+ "benchmark_type": "other",
1820
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1821
+ },
1822
+ "multirc": {
1823
+ "evaluator": null,
1824
+ "extractor_location": "lm_eval_pairs",
1825
+ "extractor_file": "multirc",
1826
+ "benchmark_type": "other",
1827
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1828
+ },
1829
+ "mutual": {
1830
+ "evaluator": null,
1831
+ "extractor_location": "lm_eval_pairs",
1832
+ "extractor_file": "mutual",
1833
+ "benchmark_type": "other",
1834
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1835
+ },
1836
+ "non": {
1837
+ "evaluator": null,
1838
+ "extractor_location": "lm_eval_pairs",
1839
+ "extractor_file": "non",
1840
+ "benchmark_type": "other",
1841
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1842
+ },
1843
+ "noreval": {
1844
+ "evaluator": "log_likelihoods",
1845
+ "extractor_location": "lm_eval_pairs",
1846
+ "extractor_file": "noreval",
1847
+ "benchmark_type": "other",
1848
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1849
+ },
1850
+ "noreval_gen": {
1851
+ "evaluator": "generation",
1852
+ "extractor_location": "lm_eval_pairs",
1853
+ "extractor_file": "noreval_gen",
1854
+ "benchmark_type": "other",
1855
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1856
+ },
1857
+ "noreval_mc": {
1858
+ "evaluator": "log_likelihoods",
1859
+ "extractor_location": "lm_eval_pairs",
1860
+ "extractor_file": "noreval_mc",
1861
+ "benchmark_type": "other",
1862
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1863
+ },
1864
+ "noticia": {
1865
+ "evaluator": "generation",
1866
+ "extractor_location": "huggingface_pairs",
1867
+ "extractor_file": "noticia",
1868
+ "benchmark_type": "other",
1869
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1870
+ },
1871
+ "nq_open": {
1872
+ "evaluator": "generation",
1873
+ "extractor_location": "lm_eval_pairs",
1874
+ "extractor_file": "nq_open",
1875
+ "benchmark_type": "other",
1876
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1877
+ },
1878
+ "okapi": {
1879
+ "evaluator": "log_likelihoods",
1880
+ "extractor_location": "lm_eval_pairs",
1881
+ "extractor_file": "okapi",
1882
+ "benchmark_type": "other",
1883
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1884
+ },
1885
+ "okapi_arc_multilingual": {
1886
+ "evaluator": "log_likelihoods",
1887
+ "extractor_location": "lm_eval_pairs",
1888
+ "extractor_file": "okapi_arc_multilingual",
1889
+ "benchmark_type": "knowledge",
1890
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1891
+ },
1892
+ "okapi_hellaswag_multilingual": {
1893
+ "evaluator": "log_likelihoods",
1894
+ "extractor_location": "lm_eval_pairs",
1895
+ "extractor_file": "okapi_hellaswag_multilingual",
1896
+ "benchmark_type": "knowledge",
1897
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1898
+ },
1899
+ "okapi_mmlu_multilingual": {
1900
+ "evaluator": "log_likelihoods",
1901
+ "extractor_location": "lm_eval_pairs",
1902
+ "extractor_file": "okapi_mmlu_multilingual",
1903
+ "benchmark_type": "knowledge",
1904
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1905
+ },
1906
+ "okapi_truthfulqa_multilingual": {
1907
+ "evaluator": "log_likelihoods",
1908
+ "extractor_location": "lm_eval_pairs",
1909
+ "extractor_file": "okapi_truthfulqa_multilingual",
1910
+ "benchmark_type": "question_answering",
1911
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1912
+ },
1913
+ "olaph": {
1914
+ "evaluator": "generation",
1915
+ "extractor_location": "lm_eval_pairs",
1916
+ "extractor_file": "olaph",
1917
+ "benchmark_type": "other",
1918
+ "explanation": "Text generation evaluation - assesses quality of generated text"
1919
+ },
1920
+ "openbookqa": {
1921
+ "evaluator": "log_likelihoods",
1922
+ "extractor_location": "lm_eval_pairs",
1923
+ "extractor_file": "openbookqa",
1924
+ "benchmark_type": "question_answering",
1925
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1926
+ },
1927
+ "openllm": {
1928
+ "evaluator": "log_likelihoods",
1929
+ "extractor_location": "huggingface_pairs",
1930
+ "extractor_file": "openllm",
1931
+ "benchmark_type": "other",
1932
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1933
+ },
1934
+ "option": {
1935
+ "evaluator": null,
1936
+ "extractor_location": "lm_eval_pairs",
1937
+ "extractor_file": "option",
1938
+ "benchmark_type": "other",
1939
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1940
+ },
1941
+ "paloma": {
1942
+ "evaluator": "perplexity",
1943
+ "extractor_location": "lm_eval_pairs",
1944
+ "extractor_file": "paloma",
1945
+ "benchmark_type": "other",
1946
+ "explanation": "Perplexity measurement - evaluates model's prediction confidence"
1947
+ },
1948
+ "parafraseja": {
1949
+ "evaluator": "log_likelihoods",
1950
+ "extractor_location": "lm_eval_pairs",
1951
+ "extractor_file": "parafraseja",
1952
+ "benchmark_type": "other",
1953
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1954
+ },
1955
+ "parafrases": {
1956
+ "evaluator": "log_likelihoods",
1957
+ "extractor_location": "lm_eval_pairs",
1958
+ "extractor_file": "parafrases",
1959
+ "benchmark_type": "other",
1960
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1961
+ },
1962
+ "paws": {
1963
+ "evaluator": null,
1964
+ "extractor_location": "lm_eval_pairs",
1965
+ "extractor_file": "paws",
1966
+ "benchmark_type": "other",
1967
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1968
+ },
1969
+ "paws_x": {
1970
+ "evaluator": "log_likelihoods",
1971
+ "extractor_location": "lm_eval_pairs",
1972
+ "extractor_file": "paws_x",
1973
+ "benchmark_type": "other",
1974
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1975
+ },
1976
+ "pawsx": {
1977
+ "evaluator": "log_likelihoods",
1978
+ "extractor_location": "lm_eval_pairs",
1979
+ "extractor_file": "pawsx",
1980
+ "benchmark_type": "other",
1981
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1982
+ },
1983
+ "persona": {
1984
+ "evaluator": "log_likelihoods",
1985
+ "extractor_location": "lm_eval_pairs",
1986
+ "extractor_file": "persona",
1987
+ "benchmark_type": "other",
1988
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1989
+ },
1990
+ "phrases": {
1991
+ "evaluator": null,
1992
+ "extractor_location": "lm_eval_pairs",
1993
+ "extractor_file": "phrases",
1994
+ "benchmark_type": "other",
1995
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
1996
+ },
1997
+ "pile": {
1998
+ "evaluator": "exact_match",
1999
+ "extractor_location": "lm_eval_pairs",
2000
+ "extractor_file": "pile",
2001
+ "benchmark_type": "other",
2002
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2003
+ },
2004
+ "pile_10k": {
2005
+ "evaluator": "generation",
2006
+ "extractor_location": "lm_eval_pairs",
2007
+ "extractor_file": "pile_10k",
2008
+ "benchmark_type": "other",
2009
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2010
+ },
2011
+ "piqa": {
2012
+ "evaluator": "log_likelihoods",
2013
+ "extractor_location": "lm_eval_pairs",
2014
+ "extractor_file": "piqa",
2015
+ "benchmark_type": "question_answering",
2016
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2017
+ },
2018
+ "polemo2": {
2019
+ "evaluator": "generation",
2020
+ "extractor_location": "lm_eval_pairs",
2021
+ "extractor_file": "polemo2",
2022
+ "benchmark_type": "other",
2023
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2024
+ },
2025
+ "polymath": {
2026
+ "evaluator": null,
2027
+ "extractor_location": "huggingface_pairs",
2028
+ "extractor_file": "polymath",
2029
+ "benchmark_type": "mathematics",
2030
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2031
+ },
2032
+ "polymath_en_high": {
2033
+ "evaluator": null,
2034
+ "extractor_location": "huggingface_pairs",
2035
+ "extractor_file": "polymath_configs",
2036
+ "benchmark_type": "mathematics",
2037
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2038
+ },
2039
+ "polymath_en_medium": {
2040
+ "evaluator": null,
2041
+ "extractor_location": "huggingface_pairs",
2042
+ "extractor_file": "polymath_configs",
2043
+ "benchmark_type": "mathematics",
2044
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2045
+ },
2046
+ "polymath_zh_high": {
2047
+ "evaluator": null,
2048
+ "extractor_location": "huggingface_pairs",
2049
+ "extractor_file": "polymath_configs",
2050
+ "benchmark_type": "mathematics",
2051
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2052
+ },
2053
+ "polymath_zh_medium": {
2054
+ "evaluator": null,
2055
+ "extractor_location": "huggingface_pairs",
2056
+ "extractor_file": "polymath_configs",
2057
+ "benchmark_type": "mathematics",
2058
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2059
+ },
2060
+ "portuguese_bench": {
2061
+ "evaluator": "log_likelihoods",
2062
+ "extractor_location": "lm_eval_pairs",
2063
+ "extractor_file": "portuguese_bench",
2064
+ "benchmark_type": "other",
2065
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2066
+ },
2067
+ "prompt": {
2068
+ "evaluator": null,
2069
+ "extractor_location": "lm_eval_pairs",
2070
+ "extractor_file": "prompt",
2071
+ "benchmark_type": "other",
2072
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2073
+ },
2074
+ "prost": {
2075
+ "evaluator": null,
2076
+ "extractor_location": "lm_eval_pairs",
2077
+ "extractor_file": "prost",
2078
+ "benchmark_type": "other",
2079
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2080
+ },
2081
+ "pubmedqa": {
2082
+ "evaluator": null,
2083
+ "extractor_location": "lm_eval_pairs",
2084
+ "extractor_file": "pubmedqa",
2085
+ "benchmark_type": "question_answering",
2086
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2087
+ },
2088
+ "pythia": {
2089
+ "evaluator": "log_likelihoods",
2090
+ "extractor_location": "huggingface_pairs",
2091
+ "extractor_file": "pythia",
2092
+ "benchmark_type": "other",
2093
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2094
+ },
2095
+ "qa4mre": {
2096
+ "evaluator": "log_likelihoods",
2097
+ "extractor_location": "lm_eval_pairs",
2098
+ "extractor_file": "qa4mre",
2099
+ "benchmark_type": "question_answering",
2100
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2101
+ },
2102
+ "qasper": {
2103
+ "evaluator": "generation",
2104
+ "extractor_location": "lm_eval_pairs",
2105
+ "extractor_file": "qasper",
2106
+ "benchmark_type": "question_answering",
2107
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2108
+ },
2109
+ "qnli": {
2110
+ "evaluator": null,
2111
+ "extractor_location": "lm_eval_pairs",
2112
+ "extractor_file": "qnli",
2113
+ "benchmark_type": "other",
2114
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2115
+ },
2116
+ "qnlieu": {
2117
+ "evaluator": null,
2118
+ "extractor_location": "lm_eval_pairs",
2119
+ "extractor_file": "qnlieu",
2120
+ "benchmark_type": "other",
2121
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2122
+ },
2123
+ "qqp": {
2124
+ "evaluator": null,
2125
+ "extractor_location": "lm_eval_pairs",
2126
+ "extractor_file": "qqp",
2127
+ "benchmark_type": "other",
2128
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2129
+ },
2130
+ "quac": {
2131
+ "evaluator": null,
2132
+ "extractor_location": "lm_eval_pairs",
2133
+ "extractor_file": "quac",
2134
+ "benchmark_type": "other",
2135
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2136
+ },
2137
+ "race": {
2138
+ "evaluator": "log_likelihoods",
2139
+ "extractor_location": "lm_eval_pairs",
2140
+ "extractor_file": "race",
2141
+ "benchmark_type": "other",
2142
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2143
+ },
2144
+ "random": {
2145
+ "evaluator": null,
2146
+ "extractor_location": "lm_eval_pairs",
2147
+ "extractor_file": "random",
2148
+ "benchmark_type": "other",
2149
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2150
+ },
2151
+ "realtoxicityprompts": {
2152
+ "evaluator": "generation",
2153
+ "extractor_location": "lm_eval_pairs",
2154
+ "extractor_file": "realtoxicityprompts",
2155
+ "benchmark_type": "other",
2156
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2157
+ },
2158
+ "recode": {
2159
+ "evaluator": null,
2160
+ "extractor_location": "huggingface_pairs",
2161
+ "extractor_file": "recode",
2162
+ "benchmark_type": "coding",
2163
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2164
+ },
2165
+ "record": {
2166
+ "evaluator": null,
2167
+ "extractor_location": "huggingface_pairs",
2168
+ "extractor_file": "record",
2169
+ "benchmark_type": "other",
2170
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2171
+ },
2172
+ "reversed": {
2173
+ "evaluator": "exact_match",
2174
+ "extractor_location": "lm_eval_pairs",
2175
+ "extractor_file": "reversed",
2176
+ "benchmark_type": "other",
2177
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2178
+ },
2179
+ "rte": {
2180
+ "evaluator": null,
2181
+ "extractor_location": "lm_eval_pairs",
2182
+ "extractor_file": "rte",
2183
+ "benchmark_type": "other",
2184
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2185
+ },
2186
+ "ruler": {
2187
+ "evaluator": null,
2188
+ "extractor_location": "lm_eval_pairs",
2189
+ "extractor_file": "ruler",
2190
+ "benchmark_type": "other",
2191
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2192
+ },
2193
+ "sciq": {
2194
+ "evaluator": "log_likelihoods",
2195
+ "extractor_location": "lm_eval_pairs",
2196
+ "extractor_file": "sciq",
2197
+ "benchmark_type": "other",
2198
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2199
+ },
2200
+ "score": {
2201
+ "evaluator": "log_likelihoods",
2202
+ "extractor_location": "lm_eval_pairs",
2203
+ "extractor_file": "score",
2204
+ "benchmark_type": "other",
2205
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2206
+ },
2207
+ "scrolls": {
2208
+ "evaluator": "generation",
2209
+ "extractor_location": "lm_eval_pairs",
2210
+ "extractor_file": "scrolls",
2211
+ "benchmark_type": "other",
2212
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2213
+ },
2214
+ "self": {
2215
+ "evaluator": "log_likelihoods",
2216
+ "extractor_location": "lm_eval_pairs",
2217
+ "extractor_file": "self",
2218
+ "benchmark_type": "other",
2219
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2220
+ },
2221
+ "sglue": {
2222
+ "evaluator": null,
2223
+ "extractor_location": "lm_eval_pairs",
2224
+ "extractor_file": "sglue",
2225
+ "benchmark_type": "other",
2226
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2227
+ },
2228
+ "simple_cooccurrence_bias": {
2229
+ "evaluator": null,
2230
+ "extractor_location": "lm_eval_pairs",
2231
+ "extractor_file": "simple_cooccurrence_bias",
2232
+ "benchmark_type": "other",
2233
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2234
+ },
2235
+ "siqa": {
2236
+ "evaluator": "log_likelihoods",
2237
+ "extractor_location": "lm_eval_pairs",
2238
+ "extractor_file": "siqa",
2239
+ "benchmark_type": "question_answering",
2240
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2241
+ },
2242
+ "social_iqa": {
2243
+ "evaluator": null,
2244
+ "extractor_location": "lm_eval_pairs",
2245
+ "extractor_file": "social_iqa",
2246
+ "benchmark_type": "question_answering",
2247
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2248
+ },
2249
+ "spanish_bench": {
2250
+ "evaluator": "log_likelihoods",
2251
+ "extractor_location": "lm_eval_pairs",
2252
+ "extractor_file": "spanish_bench",
2253
+ "benchmark_type": "other",
2254
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2255
+ },
2256
+ "squad2": {
2257
+ "evaluator": null,
2258
+ "extractor_location": "huggingface_pairs",
2259
+ "extractor_file": "squad2",
2260
+ "benchmark_type": "question_answering",
2261
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2262
+ },
2263
+ "squad_completion": {
2264
+ "evaluator": "exact_match",
2265
+ "extractor_location": "lm_eval_pairs",
2266
+ "extractor_file": "squad_completion",
2267
+ "benchmark_type": "question_answering",
2268
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2269
+ },
2270
+ "sst2": {
2271
+ "evaluator": null,
2272
+ "extractor_location": "lm_eval_pairs",
2273
+ "extractor_file": "sst2",
2274
+ "benchmark_type": "other",
2275
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2276
+ },
2277
+ "storycloze": {
2278
+ "evaluator": "log_likelihoods",
2279
+ "extractor_location": "lm_eval_pairs",
2280
+ "extractor_file": "storycloze",
2281
+ "benchmark_type": "other",
2282
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2283
+ },
2284
+ "stsb": {
2285
+ "evaluator": null,
2286
+ "extractor_location": "huggingface_pairs",
2287
+ "extractor_file": "stsb",
2288
+ "benchmark_type": "other",
2289
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2290
+ },
2291
+ "summarization": {
2292
+ "evaluator": null,
2293
+ "extractor_location": "lm_eval_pairs",
2294
+ "extractor_file": "summarization",
2295
+ "benchmark_type": "other",
2296
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2297
+ },
2298
+ "super": {
2299
+ "evaluator": null,
2300
+ "extractor_location": "lm_eval_pairs",
2301
+ "extractor_file": "super",
2302
+ "benchmark_type": "other",
2303
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2304
+ },
2305
+ "super_glue": {
2306
+ "evaluator": "log_likelihoods",
2307
+ "extractor_location": "lm_eval_pairs",
2308
+ "extractor_file": "super_glue",
2309
+ "benchmark_type": "other",
2310
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2311
+ },
2312
+ "super_glue_lm_eval_v1": {
2313
+ "evaluator": "log_likelihoods",
2314
+ "extractor_location": "huggingface_pairs",
2315
+ "extractor_file": "super_glue_lm_eval_v1",
2316
+ "benchmark_type": "other",
2317
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2318
+ },
2319
+ "super_glue_lm_eval_v1_seq2seq": {
2320
+ "evaluator": "generation",
2321
+ "extractor_location": "huggingface_pairs",
2322
+ "extractor_file": "super_glue_lm_eval_v1_seq2seq",
2323
+ "benchmark_type": "other",
2324
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2325
+ },
2326
+ "super_glue_t5_prompt": {
2327
+ "evaluator": "generation",
2328
+ "extractor_location": "huggingface_pairs",
2329
+ "extractor_file": "super_glue_t5_prompt",
2330
+ "benchmark_type": "other",
2331
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2332
+ },
2333
+ "super_gpqa": {
2334
+ "evaluator": null,
2335
+ "extractor_location": "huggingface_pairs",
2336
+ "extractor_file": "super_gpqa",
2337
+ "benchmark_type": "question_answering",
2338
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2339
+ },
2340
+ "superglue": {
2341
+ "evaluator": null,
2342
+ "extractor_location": "lm_eval_pairs",
2343
+ "extractor_file": "superglue",
2344
+ "benchmark_type": "other",
2345
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2346
+ },
2347
+ "supergpqa": {
2348
+ "evaluator": null,
2349
+ "extractor_location": "lm_eval_pairs",
2350
+ "extractor_file": "supergpqa",
2351
+ "benchmark_type": "question_answering",
2352
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2353
+ },
2354
+ "supergpqa_biology": {
2355
+ "evaluator": null,
2356
+ "extractor_location": "huggingface_pairs",
2357
+ "extractor_file": "super_gpqa",
2358
+ "benchmark_type": "question_answering",
2359
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2360
+ },
2361
+ "supergpqa_chemistry": {
2362
+ "evaluator": null,
2363
+ "extractor_location": "huggingface_pairs",
2364
+ "extractor_file": "super_gpqa",
2365
+ "benchmark_type": "question_answering",
2366
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2367
+ },
2368
+ "supergpqa_physics": {
2369
+ "evaluator": null,
2370
+ "extractor_location": "huggingface_pairs",
2371
+ "extractor_file": "super_gpqa",
2372
+ "benchmark_type": "question_answering",
2373
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2374
+ },
2375
+ "swag": {
2376
+ "evaluator": "log_likelihoods",
2377
+ "extractor_location": "lm_eval_pairs",
2378
+ "extractor_file": "swag",
2379
+ "benchmark_type": "other",
2380
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2381
+ },
2382
+ "swde": {
2383
+ "evaluator": null,
2384
+ "extractor_location": "lm_eval_pairs",
2385
+ "extractor_file": "swde",
2386
+ "benchmark_type": "other",
2387
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2388
+ },
2389
+ "sycophancy": {
2390
+ "evaluator": "log_likelihoods",
2391
+ "extractor_location": "lm_eval_pairs",
2392
+ "extractor_file": "sycophancy",
2393
+ "benchmark_type": "other",
2394
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2395
+ },
2396
+ "t0": {
2397
+ "evaluator": "generation",
2398
+ "extractor_location": "lm_eval_pairs",
2399
+ "extractor_file": "t0",
2400
+ "benchmark_type": "other",
2401
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2402
+ },
2403
+ "tag": {
2404
+ "evaluator": null,
2405
+ "extractor_location": "huggingface_pairs",
2406
+ "extractor_file": "tag",
2407
+ "benchmark_type": "other",
2408
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2409
+ },
2410
+ "teca": {
2411
+ "evaluator": "log_likelihoods",
2412
+ "extractor_location": "lm_eval_pairs",
2413
+ "extractor_file": "teca",
2414
+ "benchmark_type": "other",
2415
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2416
+ },
2417
+ "tinyarc": {
2418
+ "evaluator": "log_likelihoods",
2419
+ "extractor_location": "lm_eval_pairs",
2420
+ "extractor_file": "tinyarc",
2421
+ "benchmark_type": "knowledge",
2422
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2423
+ },
2424
+ "tinybenchmarks": {
2425
+ "evaluator": "log_likelihoods",
2426
+ "extractor_location": "lm_eval_pairs",
2427
+ "extractor_file": "tinybenchmarks",
2428
+ "benchmark_type": "other",
2429
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2430
+ },
2431
+ "tinygsm8k": {
2432
+ "evaluator": "generation",
2433
+ "extractor_location": "lm_eval_pairs",
2434
+ "extractor_file": "tinygsm8k",
2435
+ "benchmark_type": "mathematics",
2436
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2437
+ },
2438
+ "tinyhellaswag": {
2439
+ "evaluator": "log_likelihoods",
2440
+ "extractor_location": "lm_eval_pairs",
2441
+ "extractor_file": "tinyhellaswag",
2442
+ "benchmark_type": "knowledge",
2443
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2444
+ },
2445
+ "tinymmlu": {
2446
+ "evaluator": "log_likelihoods",
2447
+ "extractor_location": "lm_eval_pairs",
2448
+ "extractor_file": "tinymmlu",
2449
+ "benchmark_type": "knowledge",
2450
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2451
+ },
2452
+ "tinytruthfulqa": {
2453
+ "evaluator": "log_likelihoods",
2454
+ "extractor_location": "lm_eval_pairs",
2455
+ "extractor_file": "tinytruthfulqa",
2456
+ "benchmark_type": "question_answering",
2457
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2458
+ },
2459
+ "tinywinogrande": {
2460
+ "evaluator": "log_likelihoods",
2461
+ "extractor_location": "lm_eval_pairs",
2462
+ "extractor_file": "tinywinogrande",
2463
+ "benchmark_type": "other",
2464
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2465
+ },
2466
+ "tmlu": {
2467
+ "evaluator": "log_likelihoods",
2468
+ "extractor_location": "huggingface_pairs",
2469
+ "extractor_file": "tmlu",
2470
+ "benchmark_type": "other",
2471
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2472
+ },
2473
+ "tmmluplus": {
2474
+ "evaluator": "log_likelihoods",
2475
+ "extractor_location": "lm_eval_pairs",
2476
+ "extractor_file": "tmmluplus",
2477
+ "benchmark_type": "knowledge",
2478
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2479
+ },
2480
+ "toxigen": {
2481
+ "evaluator": "log_likelihoods",
2482
+ "extractor_location": "lm_eval_pairs",
2483
+ "extractor_file": "toxigen",
2484
+ "benchmark_type": "other",
2485
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2486
+ },
2487
+ "translation": {
2488
+ "evaluator": "generation",
2489
+ "extractor_location": "lm_eval_pairs",
2490
+ "extractor_file": "translation",
2491
+ "benchmark_type": "translation",
2492
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2493
+ },
2494
+ "triviaqa": {
2495
+ "evaluator": "generation",
2496
+ "extractor_location": "lm_eval_pairs",
2497
+ "extractor_file": "triviaqa",
2498
+ "benchmark_type": "question_answering",
2499
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2500
+ },
2501
+ "truthfulqa": {
2502
+ "evaluator": "log_likelihoods",
2503
+ "extractor_location": "lm_eval_pairs",
2504
+ "extractor_file": "truthfulqa",
2505
+ "benchmark_type": "question_answering",
2506
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2507
+ },
2508
+ "truthfulqa_gen": {
2509
+ "evaluator": "generation",
2510
+ "extractor_location": "lm_eval_pairs",
2511
+ "extractor_file": "truthfulqa_gen",
2512
+ "benchmark_type": "question_answering",
2513
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2514
+ },
2515
+ "truthfulqa_mc1": {
2516
+ "evaluator": "log_likelihoods",
2517
+ "extractor_location": "lm_eval_pairs",
2518
+ "extractor_file": "truthfulqa_mc1",
2519
+ "benchmark_type": "question_answering",
2520
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2521
+ },
2522
+ "truthfulqa_mc2": {
2523
+ "evaluator": "log_likelihoods",
2524
+ "extractor_location": "lm_eval_pairs",
2525
+ "extractor_file": "truthfulqa_mc2",
2526
+ "benchmark_type": "question_answering",
2527
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2528
+ },
2529
+ "truthfulqa_multi": {
2530
+ "evaluator": "mixed",
2531
+ "extractor_location": "lm_eval_pairs",
2532
+ "extractor_file": "truthfulqa_multi",
2533
+ "benchmark_type": "question_answering",
2534
+ "explanation": "Uses mixed evaluator"
2535
+ },
2536
+ "turblimp_core": {
2537
+ "evaluator": null,
2538
+ "extractor_location": "lm_eval_pairs",
2539
+ "extractor_file": "turblimp_core",
2540
+ "benchmark_type": "other",
2541
+ "explanation": "NO EVALUATOR DEFINED - will fail with error"
2542
+ },
2543
+ "turkishmmlu": {
2544
+ "evaluator": "log_likelihoods",
2545
+ "extractor_location": "lm_eval_pairs",
2546
+ "extractor_file": "turkishmmlu",
2547
+ "benchmark_type": "knowledge",
2548
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2549
+ },
2550
+ "twenty_newsgroups": {
2551
+ "evaluator": "exact_match",
2552
+ "extractor_location": "lm_eval_pairs",
2553
+ "extractor_file": "twenty_newsgroups",
2554
+ "benchmark_type": "other",
2555
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2556
+ },
2557
+ "unfair": {
2558
+ "evaluator": "generation",
2559
+ "extractor_location": "lm_eval_pairs",
2560
+ "extractor_file": "unfair",
2561
+ "benchmark_type": "other",
2562
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2563
+ },
2564
+ "unitxt": {
2565
+ "evaluator": "generation",
2566
+ "extractor_location": "lm_eval_pairs",
2567
+ "extractor_file": "unitxt",
2568
+ "benchmark_type": "other",
2569
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2570
+ },
2571
+ "unscramble": {
2572
+ "evaluator": "exact_match",
2573
+ "extractor_location": "lm_eval_pairs",
2574
+ "extractor_file": "unscramble",
2575
+ "benchmark_type": "other",
2576
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2577
+ },
2578
+ "vaxx": {
2579
+ "evaluator": "log_likelihoods",
2580
+ "extractor_location": "lm_eval_pairs",
2581
+ "extractor_file": "vaxx",
2582
+ "benchmark_type": "other",
2583
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2584
+ },
2585
+ "webqs": {
2586
+ "evaluator": "log_likelihoods",
2587
+ "extractor_location": "lm_eval_pairs",
2588
+ "extractor_file": "webqs",
2589
+ "benchmark_type": "other",
2590
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2591
+ },
2592
+ "wic": {
2593
+ "evaluator": "log_likelihoods",
2594
+ "extractor_location": "lm_eval_pairs",
2595
+ "extractor_file": "wic",
2596
+ "benchmark_type": "other",
2597
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2598
+ },
2599
+ "wiceu": {
2600
+ "evaluator": "log_likelihoods",
2601
+ "extractor_location": "huggingface_pairs",
2602
+ "extractor_file": "wiceu",
2603
+ "benchmark_type": "other",
2604
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2605
+ },
2606
+ "wikitext": {
2607
+ "evaluator": "generation",
2608
+ "extractor_location": "lm_eval_pairs",
2609
+ "extractor_file": "wikitext",
2610
+ "benchmark_type": "other",
2611
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2612
+ },
2613
+ "winogender": {
2614
+ "evaluator": "generation",
2615
+ "extractor_location": "lm_eval_pairs",
2616
+ "extractor_file": "winogender",
2617
+ "benchmark_type": "other",
2618
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2619
+ },
2620
+ "winogrande": {
2621
+ "evaluator": "log_likelihoods",
2622
+ "extractor_location": "lm_eval_pairs",
2623
+ "extractor_file": "winogrande",
2624
+ "benchmark_type": "other",
2625
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2626
+ },
2627
+ "wmdp": {
2628
+ "evaluator": "log_likelihoods",
2629
+ "extractor_location": "lm_eval_pairs",
2630
+ "extractor_file": "wmdp",
2631
+ "benchmark_type": "other",
2632
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2633
+ },
2634
+ "wmt14": {
2635
+ "evaluator": "generation",
2636
+ "extractor_location": "lm_eval_pairs",
2637
+ "extractor_file": "wmt14",
2638
+ "benchmark_type": "translation",
2639
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2640
+ },
2641
+ "wmt14_en_fr": {
2642
+ "evaluator": "generation",
2643
+ "extractor_location": "huggingface_pairs",
2644
+ "extractor_file": "wmt14_en_fr",
2645
+ "benchmark_type": "translation",
2646
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2647
+ },
2648
+ "wmt14_fr_en": {
2649
+ "evaluator": "generation",
2650
+ "extractor_location": "huggingface_pairs",
2651
+ "extractor_file": "wmt14_fr_en",
2652
+ "benchmark_type": "translation",
2653
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2654
+ },
2655
+ "wmt16": {
2656
+ "evaluator": "generation",
2657
+ "extractor_location": "lm_eval_pairs",
2658
+ "extractor_file": "wmt16",
2659
+ "benchmark_type": "translation",
2660
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2661
+ },
2662
+ "wmt16_de_en": {
2663
+ "evaluator": "generation",
2664
+ "extractor_location": "huggingface_pairs",
2665
+ "extractor_file": "wmt16_de_en",
2666
+ "benchmark_type": "translation",
2667
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2668
+ },
2669
+ "wmt16_en_de": {
2670
+ "evaluator": "generation",
2671
+ "extractor_location": "huggingface_pairs",
2672
+ "extractor_file": "wmt16_en_de",
2673
+ "benchmark_type": "translation",
2674
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2675
+ },
2676
+ "wmt16_en_ro": {
2677
+ "evaluator": "generation",
2678
+ "extractor_location": "huggingface_pairs",
2679
+ "extractor_file": "wmt16_en_ro",
2680
+ "benchmark_type": "translation",
2681
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2682
+ },
2683
+ "wmt16_ro_en": {
2684
+ "evaluator": "generation",
2685
+ "extractor_location": "huggingface_pairs",
2686
+ "extractor_file": "wmt16_ro_en",
2687
+ "benchmark_type": "translation",
2688
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2689
+ },
2690
+ "wmt_ro_en_t5_prompt": {
2691
+ "evaluator": "generation",
2692
+ "extractor_location": "huggingface_pairs",
2693
+ "extractor_file": "wmt_ro_en_t5_prompt",
2694
+ "benchmark_type": "translation",
2695
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2696
+ },
2697
+ "wnli": {
2698
+ "evaluator": "log_likelihoods",
2699
+ "extractor_location": "lm_eval_pairs",
2700
+ "extractor_file": "wnli",
2701
+ "benchmark_type": "other",
2702
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2703
+ },
2704
+ "wsc": {
2705
+ "evaluator": "log_likelihoods",
2706
+ "extractor_location": "lm_eval_pairs",
2707
+ "extractor_file": "wsc",
2708
+ "benchmark_type": "other",
2709
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2710
+ },
2711
+ "wsc273": {
2712
+ "evaluator": "log_likelihoods",
2713
+ "extractor_location": "lm_eval_pairs",
2714
+ "extractor_file": "wsc273",
2715
+ "benchmark_type": "other",
2716
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2717
+ },
2718
+ "xcopa": {
2719
+ "evaluator": "log_likelihoods",
2720
+ "extractor_location": "lm_eval_pairs",
2721
+ "extractor_file": "xcopa",
2722
+ "benchmark_type": "other",
2723
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2724
+ },
2725
+ "xlsum": {
2726
+ "evaluator": "generation",
2727
+ "extractor_location": "lm_eval_pairs",
2728
+ "extractor_file": "xlsum",
2729
+ "benchmark_type": "other",
2730
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2731
+ },
2732
+ "xnli": {
2733
+ "evaluator": "log_likelihoods",
2734
+ "extractor_location": "lm_eval_pairs",
2735
+ "extractor_file": "xnli",
2736
+ "benchmark_type": "other",
2737
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2738
+ },
2739
+ "xquad": {
2740
+ "evaluator": "generation",
2741
+ "extractor_location": "lm_eval_pairs",
2742
+ "extractor_file": "xquad",
2743
+ "benchmark_type": "other",
2744
+ "explanation": "Text generation evaluation - assesses quality of generated text"
2745
+ },
2746
+ "xstorycloze": {
2747
+ "evaluator": "log_likelihoods",
2748
+ "extractor_location": "lm_eval_pairs",
2749
+ "extractor_file": "xstorycloze",
2750
+ "benchmark_type": "other",
2751
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2752
+ },
2753
+ "xsum": {
2754
+ "evaluator": "exact_match",
2755
+ "extractor_location": "huggingface_pairs",
2756
+ "extractor_file": "xsum",
2757
+ "benchmark_type": "other",
2758
+ "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2759
+ },
2760
+ "xwinograd": {
2761
+ "evaluator": "log_likelihoods",
2762
+ "extractor_location": "lm_eval_pairs",
2763
+ "extractor_file": "xwinograd",
2764
+ "benchmark_type": "other",
2765
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2766
+ },
2767
+ "yahoo": {
2768
+ "evaluator": "log_likelihoods",
2769
+ "extractor_location": "lm_eval_pairs",
2770
+ "extractor_file": "yahoo",
2771
+ "benchmark_type": "other",
2772
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2773
+ },
2774
+ "zhoblimp": {
2775
+ "evaluator": "log_likelihoods",
2776
+ "extractor_location": "lm_eval_pairs",
2777
+ "extractor_file": "zhoblimp",
2778
+ "benchmark_type": "other",
2779
+ "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2780
+ }
2781
+ }
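
The mapping above ties each benchmark name to an evaluator ("log_likelihoods", "exact_match", "generation", "mixed", or null), an extractor location, and a benchmark type. Below is a minimal sketch of how such a lookup could be consumed; the file name `benchmark_evaluators.json` and the helper `resolve_evaluator` are illustrative assumptions, not the package's actual API.

```python
# Minimal sketch (hypothetical helper, not part of the wisent package API):
# resolve a benchmark's evaluator from the JSON mapping shown in the diff above.
import json
from pathlib import Path


def resolve_evaluator(config_path: str, benchmark: str) -> str:
    """Return the evaluator name for a benchmark, or raise if none is defined."""
    mapping = json.loads(Path(config_path).read_text())
    entry = mapping[benchmark]
    evaluator = entry["evaluator"]
    if evaluator is None:
        # Mirrors the "NO EVALUATOR DEFINED - will fail with error" entries above.
        raise ValueError(f"No evaluator defined for benchmark '{benchmark}'")
    return evaluator


# Example usage (file name is an assumption):
# resolve_evaluator("benchmark_evaluators.json", "winogrande")  -> "log_likelihoods"
# resolve_evaluator("benchmark_evaluators.json", "squad2")      -> raises ValueError
```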