wisent-0.7.379-py3-none-any.whl → wisent-0.7.701-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (725)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +22 -6
  3. wisent/core/activations/activations.py +21 -39
  4. wisent/core/activations/activations_collector.py +141 -373
  5. wisent/core/activations/classifier_inference_strategy.py +194 -0
  6. wisent/core/activations/core/atoms.py +8 -92
  7. wisent/core/activations/extraction_strategy.py +308 -0
  8. wisent/core/agent/diagnose/response_diagnostics.py +3 -3
  9. wisent/core/agent/diagnose.py +3 -3
  10. wisent/core/autonomous_agent.py +2 -2
  11. wisent/core/cli/agent/apply_steering.py +23 -27
  12. wisent/core/cli/agent/evaluate_response.py +18 -20
  13. wisent/core/cli/agent/train_classifier.py +18 -20
  14. wisent/core/cli/cluster_benchmarks.py +472 -0
  15. wisent/core/cli/create_steering_vector.py +13 -5
  16. wisent/core/cli/generate_vector_from_task.py +4 -0
  17. wisent/core/cli/get_activations.py +12 -36
  18. wisent/core/cli/method_optimizer.py +859 -0
  19. wisent/core/cli/optimize.py +44 -5
  20. wisent/core/cli/optimize_classification.py +5 -6
  21. wisent/core/cli/optimize_sample_size.py +8 -22
  22. wisent/core/cli/optimize_steering.py +429 -153
  23. wisent/core/cli/optimize_weights.py +65 -6
  24. wisent/core/cli/steering_method_trainer.py +5 -4
  25. wisent/core/cli/steering_search_space.py +20 -15
  26. wisent/core/cli/tasks.py +14 -43
  27. wisent/core/cli/train_unified_goodness.py +17 -18
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1578 -173
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +63 -80
  30. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
  36. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
  37. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -5
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
  43. wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
  44. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
  45. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
  46. wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
  47. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
  48. wisent/core/evaluators/personalization/coherence.py +46 -0
  49. wisent/core/hyperparameter_optimizer.py +13 -13
  50. wisent/core/lm_eval_harness_ground_truth.py +7 -11
  51. wisent/core/main.py +3 -0
  52. wisent/core/models/wisent_model.py +8 -7
  53. wisent/core/opti/methods/opti_weights.py +29 -2
  54. wisent/core/optuna/classifier/activation_generator.py +14 -12
  55. wisent/core/optuna/steering/steering_optimization.py +14 -9
  56. wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
  57. wisent/core/parser_arguments/generate_vector_from_task_parser.py +20 -0
  58. wisent/core/parser_arguments/main_parser.py +8 -0
  59. wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
  60. wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
  61. wisent/core/parser_arguments/tasks_parser.py +7 -19
  62. wisent/core/steering_methods/core/atoms.py +1 -2
  63. wisent/core/steering_methods/methods/caa.py +1 -1
  64. wisent/core/steering_methods/methods/hyperplane.py +74 -0
  65. wisent/core/steering_methods/methods/prism.py +1 -2
  66. wisent/core/steering_methods/methods/pulse.py +39 -8
  67. wisent/core/steering_methods/methods/titan.py +59 -14
  68. wisent/core/steering_methods/registry.py +52 -12
  69. wisent/core/steering_optimizer.py +15 -15
  70. wisent/core/trainers/steering_trainer.py +9 -18
  71. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
  72. wisent/scripts/run_quality_metrics_sweep.sh +22 -27
  73. wisent/tests/test_aggregation_geometry.py +236 -0
  74. wisent/tests/test_detector_accuracy.py +163 -0
  75. wisent/tests/test_geometry_exhaustive.py +1202 -0
  76. wisent/tests/visualize_geometry.py +255 -61
  77. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/METADATA +1 -1
  78. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/RECORD +82 -714
  79. wisent/core/activations/prompt_construction_strategy.py +0 -47
  80. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
  81. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
  82. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
  83. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
  84. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
  85. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
  86. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
  87. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
  88. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
  89. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
  90. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
  96. wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
  97. wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
  98. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
  99. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
  100. wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
  101. wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
  102. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
  103. wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
  104. wisent/examples/scripts/results/benchmark_tags.json +0 -917
  105. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
  106. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
  107. wisent/examples/scripts/results/failing_benchmarks.json +0 -946
  108. wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
  109. wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
  110. wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
  111. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
  112. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
  113. wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
  114. wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
  115. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
  116. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
  117. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
  118. wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
  119. wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
  120. wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
  121. wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
  122. wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
  123. wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
  124. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
  125. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
  126. wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
  127. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
  128. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
  129. wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
  130. wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
  131. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
  132. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
  133. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
  134. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
  135. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  136. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
  137. wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
  138. wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
  139. wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
  140. wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
  141. wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
  142. wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
  143. wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
  144. wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
  145. wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
  146. wisent/examples/scripts/results/test_aime_pairs.json +0 -8
  147. wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
  148. wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
  149. wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
  150. wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
  151. wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
  152. wisent/examples/scripts/results/test_anli_pairs.json +0 -8
  153. wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
  154. wisent/examples/scripts/results/test_apps_pairs.json +0 -8
  155. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
  156. wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
  157. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
  158. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
  159. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
  160. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
  161. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
  162. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
  163. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
  164. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
  165. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
  166. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
  167. wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
  168. wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
  169. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
  170. wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
  171. wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
  172. wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
  173. wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
  174. wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
  175. wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
  176. wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
  177. wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
  178. wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
  179. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
  180. wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
  181. wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
  182. wisent/examples/scripts/results/test_atis_pairs.json +0 -8
  183. wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
  184. wisent/examples/scripts/results/test_babi_pairs.json +0 -8
  185. wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
  186. wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
  187. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
  188. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
  189. wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
  190. wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
  191. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
  192. wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
  193. wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
  194. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
  195. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
  196. wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
  197. wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
  198. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
  199. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
  200. wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
  201. wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
  202. wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
  203. wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
  204. wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
  205. wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
  206. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
  207. wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
  208. wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
  209. wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
  210. wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
  211. wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
  212. wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
  213. wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
  214. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
  215. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
  216. wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
  217. wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
  218. wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
  219. wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
  220. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
  221. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
  222. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
  223. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
  224. wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
  225. wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
  226. wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
  227. wisent/examples/scripts/results/test_c4_pairs.json +0 -8
  228. wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
  229. wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
  230. wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
  231. wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
  232. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
  233. wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
  234. wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
  235. wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
  236. wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
  237. wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
  238. wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
  239. wisent/examples/scripts/results/test_cb_pairs.json +0 -8
  240. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
  241. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
  242. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
  243. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
  244. wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
  245. wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
  246. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
  247. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
  248. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
  249. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
  250. wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
  251. wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
  252. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
  253. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
  254. wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
  255. wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
  256. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
  257. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
  258. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
  259. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
  260. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
  261. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
  262. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
  263. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
  264. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
  265. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
  266. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
  267. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
  268. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
  269. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
  270. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
  271. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
  272. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
  273. wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
  274. wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
  275. wisent/examples/scripts/results/test_cola_pairs.json +0 -8
  276. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
  277. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
  278. wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
  279. wisent/examples/scripts/results/test_conala_pairs.json +0 -8
  280. wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
  281. wisent/examples/scripts/results/test_concode_pairs.json +0 -8
  282. wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
  283. wisent/examples/scripts/results/test_copa_pairs.json +0 -8
  284. wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
  285. wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
  286. wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
  287. wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
  288. wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
  289. wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
  290. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
  291. wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
  292. wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
  293. wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
  294. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
  295. wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
  296. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
  297. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
  298. wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
  299. wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
  300. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
  301. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
  302. wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
  303. wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
  304. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
  305. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
  306. wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
  307. wisent/examples/scripts/results/test_drop_pairs.json +0 -8
  308. wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
  309. wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
  310. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
  311. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
  312. wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
  313. wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
  314. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
  315. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
  316. wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
  317. wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
  318. wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
  319. wisent/examples/scripts/results/test_escola_pairs.json +0 -8
  320. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
  321. wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
  322. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
  323. wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
  324. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
  325. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
  326. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
  327. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
  328. wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
  329. wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
  330. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
  331. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
  332. wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
  333. wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
  334. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
  335. wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
  336. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
  337. wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
  338. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  339. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  340. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
  341. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
  342. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
  343. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
  344. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
  345. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
  346. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
  347. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
  348. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  349. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  350. wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
  351. wisent/examples/scripts/results/test_fda_pairs.json +0 -8
  352. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
  353. wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
  354. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
  355. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
  356. wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
  357. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
  358. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
  359. wisent/examples/scripts/results/test_fld_pairs.json +0 -8
  360. wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
  361. wisent/examples/scripts/results/test_flores_pairs.json +0 -14
  362. wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
  363. wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
  364. wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
  365. wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
  366. wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
  367. wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
  368. wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
  369. wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
  370. wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
  371. wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
  372. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
  373. wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
  374. wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
  375. wisent/examples/scripts/results/test_glue_pairs.json +0 -14
  376. wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
  377. wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
  378. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
  379. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
  380. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
  381. wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
  382. wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
  383. wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
  384. wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
  385. wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
  386. wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
  387. wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
  388. wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
  389. wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
  390. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
  391. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
  392. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
  393. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
  394. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
  395. wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
  396. wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
  397. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
  398. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
  399. wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
  400. wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
  401. wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
  402. wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
  403. wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
  404. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
  405. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
  406. wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
  407. wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
  408. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
  409. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
  410. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
  411. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
  412. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
  413. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
  414. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
  415. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
  416. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
  417. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
  418. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
  419. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
  420. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
  421. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
  422. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
  423. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
  424. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
  425. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
  426. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
  427. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
  428. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
  429. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
  430. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
  431. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
  432. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
  433. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
  434. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
  435. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
  436. wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
  437. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
  438. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
  439. wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
  440. wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
  441. wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
  442. wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
  443. wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
  444. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
  445. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
  446. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
  447. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
  448. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
  449. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
  450. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
  451. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
  452. wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
  453. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  454. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  455. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
  456. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
  457. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
  458. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
  459. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
  460. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
  461. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
  462. wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
  463. wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
  464. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  465. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  466. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  467. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  468. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
  469. wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
  470. wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
  471. wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
  472. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
  473. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
  474. wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
  475. wisent/examples/scripts/results/test_libra_pairs.json +0 -14
  476. wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
  477. wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
  478. wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
  479. wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
  480. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
  481. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
  482. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
  483. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
  484. wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
  485. wisent/examples/scripts/results/test_llama_pairs.json +0 -8
  486. wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
  487. wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
  488. wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
  489. wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
  490. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
  491. wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
  492. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
  493. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
  494. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
  495. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
  496. wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
  497. wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
  498. wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
  499. wisent/examples/scripts/results/test_math500_pairs.json +0 -8
  500. wisent/examples/scripts/results/test_math_evaluation.json +0 -30
  501. wisent/examples/scripts/results/test_math_pairs.json +0 -8
  502. wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
  503. wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
  504. wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
  505. wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
  506. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
  507. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
  508. wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
  509. wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
  510. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
  511. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
  512. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
  513. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
  514. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
  515. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
  516. wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
  517. wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
  518. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
  519. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
  520. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
  521. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
  522. wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
  523. wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
  524. wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
  525. wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
  526. wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
  527. wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
  528. wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
  529. wisent/examples/scripts/results/test_mela_pairs.json +0 -14
  530. wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
  531. wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
  532. wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
  533. wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
  534. wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
  535. wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
  536. wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
  537. wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
  538. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
  539. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
  540. wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
  541. wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
  542. wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
  543. wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
  544. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
  545. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
  546. wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
  547. wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
  548. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
  549. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
  550. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
  551. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
  552. wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
  553. wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
  554. wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
  555. wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
  556. wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
  557. wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
  558. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
  559. wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
  560. wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
  561. wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
  562. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
  563. wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
  564. wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
  565. wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
  566. wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
  567. wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
  568. wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
  569. wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
  570. wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
  571. wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
  572. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
  573. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
  574. wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
  575. wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
  576. wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
  577. wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
  578. wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
  579. wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
  580. wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
  581. wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
  582. wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
  583. wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
  584. wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
  585. wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
  586. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
  587. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
  588. wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
  589. wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
  590. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
  591. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
  592. wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
  593. wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
  594. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
  595. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
  596. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
  597. wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
  598. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
  599. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
  600. wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
  601. wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
  602. wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
  603. wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
  604. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
  605. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
  606. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
  607. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
  608. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
  609. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
  610. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
  611. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
  612. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
  613. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
  614. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  615. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  616. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  617. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  618. wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
  619. wisent/examples/scripts/results/test_prost_pairs.json +0 -8
  620. wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
  621. wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
  622. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
  623. wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
  624. wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
  625. wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
  626. wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
  627. wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
  628. wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
  629. wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
  630. wisent/examples/scripts/results/test_race_evaluation.json +0 -30
  631. wisent/examples/scripts/results/test_race_pairs.json +0 -8
  632. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
  633. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
  634. wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
  635. wisent/examples/scripts/results/test_recode_pairs.json +0 -8
  636. wisent/examples/scripts/results/test_record_evaluation.json +0 -30
  637. wisent/examples/scripts/results/test_record_pairs.json +0 -8
  638. wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
  639. wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
  640. wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
  641. wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
  642. wisent/examples/scripts/results/test_score_evaluation.json +0 -51
  643. wisent/examples/scripts/results/test_score_pairs.json +0 -14
  644. wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
  645. wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
  646. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
  647. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
  648. wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
  649. wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
  650. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
  651. wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
  652. wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
  653. wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
  654. wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
  655. wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
  656. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
  657. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
  658. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
  659. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
  660. wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
  661. wisent/examples/scripts/results/test_swag_pairs.json +0 -8
  662. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
  663. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
  664. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
  665. wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
  666. wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
  667. wisent/examples/scripts/results/test_translation_pairs.json +0 -14
  668. wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
  669. wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
  670. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
  671. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
  672. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
  673. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
  674. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
  675. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
  676. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
  677. wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
  678. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
  679. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
  680. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
  681. wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
  682. wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
  683. wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
  684. wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
  685. wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
  686. wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
  687. wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
  688. wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
  689. wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
  690. wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
  691. wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
  692. wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
  693. wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
  694. wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
  695. wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
  696. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
  697. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
  698. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
  699. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
  700. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
  701. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
  702. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
  703. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
  704. wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
  705. wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
  706. wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
  707. wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
  708. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
  709. wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
  710. wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
  711. wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
  712. wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
  713. wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
  714. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
  715. wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
  716. wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
  717. wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
  718. wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
  719. wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
  720. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
  721. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
  722. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/WHEEL +0 -0
  723. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/entry_points.txt +0 -0
  724. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/licenses/LICENSE +0 -0
  725. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/top_level.txt +0 -0
@@ -1,2781 +0,0 @@
- {
- "Tag": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "tag",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "aclue": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "aclue",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "acp_bench": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "acp_bench",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "acp_bench_hard": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "acp_bench_hard",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "advanced": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "advanced",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "aexams": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "aexams",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "afrimgsm": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "afrimgsm",
- "benchmark_type": "mathematics",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "afrimmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "afrimmlu",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "afrixnli": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "afrixnli",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "ag": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ag",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "agieval": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "agieval",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "ai2_arc": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ai2_arc",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "aime": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "aime",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "aime2024": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "aime",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "aime2025": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "aime",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "anagrams1": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "anagrams1",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "anagrams2": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "anagrams2",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "anli": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "anli",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "apps": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "apps",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "arabculture": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arabculture",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arabic": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arabic",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arabic_leaderboard_complete": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arabic_leaderboard_complete",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arabic_leaderboard_light": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arabic_leaderboard_light",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arabicmmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arabicmmlu",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "aradice": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "aradice",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arc": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arc",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arc_challenge": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arc_challenge",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arc_easy": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arc_easy",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "argument": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "argument",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "arithmetic": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arithmetic",
- "benchmark_type": "mathematics",
- "explanation": "Text comparison (WARNING: should use execution for mathematics)"
- },
- "asdiv": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "asdiv",
- "benchmark_type": "mathematics",
- "explanation": "Text comparison (WARNING: should use execution for mathematics)"
- },
- "asdiv_cot_llama": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "assin": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "assin",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "atis": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "atis",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "babi": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "babi",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "babilong": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "babilong",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "bangla_mmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "bangla_mmlu",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "banking77": {
- "evaluator": "exact_match",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "banking77",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "basque_bench": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "basque_bench",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "basque_glue": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "basque_glue",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "basqueglue": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "basqueglue",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "bbh": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "bbh",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "bbq": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "bbq",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "bec2016eu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "bec2016eu",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "belebele": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "belebele",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "benchmarks": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "benchmarks",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "bertaqa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "bertaqa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "bhs": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "bhs",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "bhtc": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "bhtc",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "bigbench": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "bigbench",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "blimp": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "blimp",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "blimp_nl": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "blimp_nl",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "boolq": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "boolq",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "boolq_seq2seq": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "boolq_seq2seq",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "c4": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "c4",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "cabbq": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cabbq",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "cabreu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cabreu",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "careqa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "careqa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "catalan_bench": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "catalan_bench",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "catalanqa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "catalanqa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "catcola": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "catcola",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "cb": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "cb",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "ceval": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ceval",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "ceval_valid": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ceval_valid",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "chain": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "chain",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "chain_of_thought": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "chartqa": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "chartqa",
- "benchmark_type": "question_answering",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "claim": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "claim",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "click": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "click",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "cmmlu": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cmmlu",
- "benchmark_type": "knowledge",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "cnn": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cnn",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "cocoteros": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cocoteros",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "code2text": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "code2text",
- "benchmark_type": "coding",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "code_x_glue": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "code_x_glue",
- "benchmark_type": "coding",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "codexglue": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "codexglue_code_to_text_go": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "codexglue_code_to_text_java": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "codexglue_code_to_text_javascript": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "codexglue_code_to_text_php": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "codexglue_code_to_text_python": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "codexglue_code_to_text_ruby": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "coedit": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "coedit",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "cola": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cola",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "commonsense": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "commonsense",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "commonsense_qa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "commonsense_qa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "conala": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "conala",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "concode": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "concode",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "copa": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "copa",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "copal_id": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "copal_id",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "coqa": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "coqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "coqcat": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "coqcat",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "crows_pairs": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "crows_pairs",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "csatqa": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "csatqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "cycle": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cycle",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "cycle_letters": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cycle_letters",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "darija_bench": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "darija_bench",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "darijahellaswag": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "darijahellaswag",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "darijammlu": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "darijammlu",
- "benchmark_type": "knowledge",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "dbpedia": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "dbpedia",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "discrim_eval": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "discrim_eval",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "doc": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "doc",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "doc_vqa": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "doc_vqa",
- "benchmark_type": "question_answering",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "drop": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "drop",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "ds1000": {
- "evaluator": "exact_match",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "ds1000",
- "benchmark_type": "coding",
- "explanation": "Text comparison (WARNING: should use execution for coding)"
- },
- "ds_1000": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "ds_1000",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "egyhellaswag": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "egyhellaswag",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "egymmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "egymmlu",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "epec": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "epec",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "eq": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eq",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "eq_bench": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eq_bench",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "eq_bench_ca": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eq_bench_ca",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "eq_bench_es": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eq_bench_es",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "esbbq": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "esbbq",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "escola": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "escola",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "ethics": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ethics",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "ethos": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ethos",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "eus": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eus",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "eus_exams": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eus_exams",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "eus_proficiency": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eus_proficiency",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "eus_reading": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eus_reading",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "eus_trivia": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eus_trivia",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "evalita_llm": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "evalita_llm",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "evalita_mp": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "evalita_mp",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "evalita_sp": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "evalita_sp",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "fda": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "fda",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "financial": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "financial",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "financial_tweets": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "financial_tweets",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "flan": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "flan",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "fld": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "fld",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "flores": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "flores",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "freebase": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "freebase",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "french_bench": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "french_bench",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "galcola": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "galcola",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "galician_bench": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "galician_bench",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "gaokao": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "gaokao",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "glianorex": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "glianorex",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "global_mmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "global_mmlu",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "global_piqa": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "global_piqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "glue": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "glue",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "gpqa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "gpqa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "gpt3": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "gpt3",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "groundcocoa": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "groundcocoa",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "gsm": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "gsm",
- "benchmark_type": "mathematics",
- "explanation": "Text comparison (WARNING: should use execution for mathematics)"
- },
- "gsm8k": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "gsm8k",
- "benchmark_type": "mathematics",
- "explanation": "Text comparison (WARNING: should use execution for mathematics)"
- },
- "gsm8k_cot": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "gsm8k_cot_llama": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "gsm8k_cot_self_consistency": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "gsm8k_llama": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "gsm8k_platinum_cot": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "gsm8k_platinum_cot_llama": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "gsm8k_platinum_cot_self_consistency": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "gsm_plus": {
- "evaluator": "exact_match",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "gsm_plus",
- "benchmark_type": "mathematics",
- "explanation": "Text comparison (WARNING: should use execution for mathematics)"
- },
- "haerae": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "haerae",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "headqa": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "headqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "hellaswag": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "hellaswag",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "hendrycks_ethics": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "hendrycks_ethics",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "hendrycks_math": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "hendrycks_math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "histoires_morales": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "histoires_morales",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "hle": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "hle",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "hle_exact_match": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "hle",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "hle_multiple_choice": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "hle",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "hmmt": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "hmmt",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "hmmt_feb_2025": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "hmmt",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "hrm8k": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "hrm8k",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "humaneval": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "humaneval",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "humaneval_64_instruct": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "instructhumaneval",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "humaneval_infilling": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "humaneval_infilling",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "humaneval_instruct": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "instructhumaneval",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "humaneval_plus": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "humaneval",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "humanevalpack": {
- "evaluator": "exact_match",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "humanevalpack",
- "benchmark_type": "coding",
- "explanation": "Text comparison (WARNING: should use execution for coding)"
- },
- "icelandic_winogrande": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "icelandic_winogrande",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "ifeval": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ifeval",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "instruct_humaneval": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "instructhumaneval",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "instructhumaneval": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "instructhumaneval",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "inverse": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "inverse",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "inverse_scaling": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "inverse_scaling",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "iwslt2017": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "iwslt2017",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "iwslt2017_ar_en": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "iwslt2017_ar_en",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "iwslt2017_en_ar": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "iwslt2017_en_ar",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "ja": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ja",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "japanese_leaderboard": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "japanese_leaderboard",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "jsonschema_bench": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "jsonschema_bench",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "kbl": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "kbl",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "kmmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "kmmlu",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "kobest": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "kobest",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "kormedmcqa": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "kormedmcqa",
- "benchmark_type": "question_answering",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "lambada": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "lambada",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "lambada_cloze": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "lambada_cloze",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "lambada_multilingual": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "lambada_multilingual",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "lambada_multilingual_stablelm": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "lambada_multilingual_stablelm",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "law": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "law",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "law_stack_exchange": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "law_stack_exchange",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "leaderboard": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "leaderboard",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "ledgar": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "ledgar",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "libra": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "libra",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "lingoly": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "lingoly",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "livecodebench": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "livecodebench",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "livemathbench": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "livemathbench",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "livemathbench_cnmo_en": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "livemathbench_configs",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "livemathbench_cnmo_zh": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "livemathbench_configs",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "llama": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "llama",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "llama3": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "llama3",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "lm_syneval": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "lm_syneval",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "logieval": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "logieval",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "logiqa": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "logiqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "logiqa2": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "logiqa2",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "longbench": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "longbench",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "longbenchv2": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "longbenchv2",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "m_mmlu": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "m_mmlu",
- "benchmark_type": "knowledge",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "mastermind": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "mastermind",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "math": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "math500": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "math_500": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "mathqa": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "mathqa",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "mbpp": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "mbpp",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "mbpp_plus": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "mbpp",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "mc-taco": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "mc-taco",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "med_concepts_qa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "med_concepts_qa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "meddialog": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "meddialog",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "meddialog_qsumm": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "meddialog",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "meddialog_qsumm_perplexity": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "meddialog",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "meddialog_raw_dialogues": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "meddialog",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "meddialog_raw_perplexity": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "meddialog",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "medical": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "medical",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "medical_abstracts": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "medical_abstracts",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "mediqa_qa2019": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "mediqa_qa2019",
- "benchmark_type": "question_answering",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "medmcqa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "medmcqa",
1588
- "benchmark_type": "question_answering",
1589
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1590
- },
1591
- "medqa": {
1592
- "evaluator": null,
1593
- "extractor_location": "lm_eval_pairs",
1594
- "extractor_file": "medqa",
1595
- "benchmark_type": "question_answering",
1596
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1597
- },
1598
- "medtext": {
1599
- "evaluator": "generation",
1600
- "extractor_location": "lm_eval_pairs",
1601
- "extractor_file": "medtext",
1602
- "benchmark_type": "other",
1603
- "explanation": "Text generation evaluation - assesses quality of generated text"
1604
- },
1605
- "mela": {
1606
- "evaluator": "log_likelihoods",
1607
- "extractor_location": "huggingface_pairs",
1608
- "extractor_file": "mela",
1609
- "benchmark_type": "other",
1610
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1611
- },
1612
- "meqsum": {
1613
- "evaluator": "generation",
1614
- "extractor_location": "lm_eval_pairs",
1615
- "extractor_file": "meqsum",
1616
- "benchmark_type": "other",
1617
- "explanation": "Text generation evaluation - assesses quality of generated text"
1618
- },
1619
- "mercury": {
1620
- "evaluator": null,
1621
- "extractor_location": "huggingface_pairs",
1622
- "extractor_file": "mercury",
1623
- "benchmark_type": "other",
1624
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1625
- },
1626
- "metabench": {
1627
- "evaluator": null,
1628
- "extractor_location": "lm_eval_pairs",
1629
- "extractor_file": "metabench",
1630
- "benchmark_type": "other",
1631
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1632
- },
1633
- "mgsm": {
1634
- "evaluator": "generation",
1635
- "extractor_location": "lm_eval_pairs",
1636
- "extractor_file": "mgsm",
1637
- "benchmark_type": "mathematics",
1638
- "explanation": "Text generation evaluation - assesses quality of generated text"
1639
- },
1640
- "mimic_repsum": {
1641
- "evaluator": "generation",
1642
- "extractor_location": "lm_eval_pairs",
1643
- "extractor_file": "mimic_repsum",
1644
- "benchmark_type": "other",
1645
- "explanation": "Text generation evaluation - assesses quality of generated text"
1646
- },
1647
- "minerva_math": {
1648
- "evaluator": "generation",
1649
- "extractor_location": "lm_eval_pairs",
1650
- "extractor_file": "minerva_math",
1651
- "benchmark_type": "mathematics",
1652
- "explanation": "Text generation evaluation - assesses quality of generated text"
1653
- },
1654
- "mlqa": {
1655
- "evaluator": "generation",
1656
- "extractor_location": "lm_eval_pairs",
1657
- "extractor_file": "mlqa",
1658
- "benchmark_type": "question_answering",
1659
- "explanation": "Text generation evaluation - assesses quality of generated text"
1660
- },
1661
- "mmlu": {
1662
- "evaluator": "log_likelihoods",
1663
- "extractor_location": "lm_eval_pairs",
1664
- "extractor_file": "mmlu",
1665
- "benchmark_type": "knowledge",
1666
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1667
- },
1668
- "mmlu_pro": {
1669
- "evaluator": null,
1670
- "extractor_location": "lm_eval_pairs",
1671
- "extractor_file": "mmlu_pro",
1672
- "benchmark_type": "knowledge",
1673
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1674
- },
1675
- "mmlusr": {
1676
- "evaluator": null,
1677
- "extractor_location": "huggingface_pairs",
1678
- "extractor_file": "mmlusr",
1679
- "benchmark_type": "knowledge",
1680
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1681
- },
1682
- "mmlusr_answer_only": {
1683
- "evaluator": null,
1684
- "extractor_location": "huggingface_pairs",
1685
- "extractor_file": "mmlusr",
1686
- "benchmark_type": "knowledge",
1687
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1688
- },
1689
- "mmlusr_question_and_answer": {
1690
- "evaluator": null,
1691
- "extractor_location": "huggingface_pairs",
1692
- "extractor_file": "mmlusr",
1693
- "benchmark_type": "knowledge",
1694
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1695
- },
1696
- "mmlusr_question_only": {
1697
- "evaluator": null,
1698
- "extractor_location": "huggingface_pairs",
1699
- "extractor_file": "mmlusr",
1700
- "benchmark_type": "knowledge",
1701
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1702
- },
1703
- "mmmu": {
1704
- "evaluator": "log_likelihoods",
1705
- "extractor_location": "lm_eval_pairs",
1706
- "extractor_file": "mmmu",
1707
- "benchmark_type": "other",
1708
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1709
- },
1710
- "mnli": {
1711
- "evaluator": "log_likelihoods",
1712
- "extractor_location": "lm_eval_pairs",
1713
- "extractor_file": "mnli",
1714
- "benchmark_type": "other",
1715
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1716
- },
1717
- "model_written_evals": {
1718
- "evaluator": "log_likelihoods",
1719
- "extractor_location": "lm_eval_pairs",
1720
- "extractor_file": "model_written_evals",
1721
- "benchmark_type": "other",
1722
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1723
- },
1724
- "moral_stories": {
1725
- "evaluator": "log_likelihoods",
1726
- "extractor_location": "lm_eval_pairs",
1727
- "extractor_file": "moral_stories",
1728
- "benchmark_type": "other",
1729
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1730
- },
1731
- "mrpc": {
1732
- "evaluator": null,
1733
- "extractor_location": "lm_eval_pairs",
1734
- "extractor_file": "mrpc",
1735
- "benchmark_type": "other",
1736
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1737
- },
1738
- "mts_dialog": {
1739
- "evaluator": "generation",
1740
- "extractor_location": "lm_eval_pairs",
1741
- "extractor_file": "mts_dialog",
1742
- "benchmark_type": "other",
1743
- "explanation": "Text generation evaluation - assesses quality of generated text"
1744
- },
1745
- "multiblimp": {
1746
- "evaluator": "log_likelihoods",
1747
- "extractor_location": "lm_eval_pairs",
1748
- "extractor_file": "multiblimp",
1749
- "benchmark_type": "other",
1750
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1751
- },
1752
- "multilingual": {
1753
- "evaluator": null,
1754
- "extractor_location": "lm_eval_pairs",
1755
- "extractor_file": "multilingual",
1756
- "benchmark_type": "other",
1757
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1758
- },
1759
- "multimedqa": {
1760
- "evaluator": "log_likelihoods",
1761
- "extractor_location": "huggingface_pairs",
1762
- "extractor_file": "multimedqa",
1763
- "benchmark_type": "question_answering",
1764
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1765
- },
1766
- "multipl_e": {
1767
- "evaluator": null,
1768
- "extractor_location": "huggingface_pairs",
1769
- "extractor_file": "multipl_e",
1770
- "benchmark_type": "other",
1771
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1772
- },
1773
- "multiple": {
1774
- "evaluator": null,
1775
- "extractor_location": "huggingface_pairs",
1776
- "extractor_file": "multiple",
1777
- "benchmark_type": "other",
1778
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1779
- },
1780
- "multiple_cpp": {
1781
- "evaluator": null,
1782
- "extractor_location": "huggingface_pairs",
1783
- "extractor_file": "multipl_e",
1784
- "benchmark_type": "other",
1785
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1786
- },
1787
- "multiple_go": {
1788
- "evaluator": null,
1789
- "extractor_location": "huggingface_pairs",
1790
- "extractor_file": "multipl_e",
1791
- "benchmark_type": "other",
1792
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1793
- },
1794
- "multiple_java": {
1795
- "evaluator": null,
1796
- "extractor_location": "huggingface_pairs",
1797
- "extractor_file": "multipl_e",
1798
- "benchmark_type": "other",
1799
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1800
- },
1801
- "multiple_js": {
1802
- "evaluator": null,
1803
- "extractor_location": "huggingface_pairs",
1804
- "extractor_file": "multipl_e",
1805
- "benchmark_type": "other",
1806
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1807
- },
1808
- "multiple_py": {
1809
- "evaluator": null,
1810
- "extractor_location": "huggingface_pairs",
1811
- "extractor_file": "multipl_e",
1812
- "benchmark_type": "other",
1813
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1814
- },
1815
- "multiple_rs": {
1816
- "evaluator": null,
1817
- "extractor_location": "huggingface_pairs",
1818
- "extractor_file": "multipl_e",
1819
- "benchmark_type": "other",
1820
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1821
- },
1822
- "multirc": {
1823
- "evaluator": null,
1824
- "extractor_location": "lm_eval_pairs",
1825
- "extractor_file": "multirc",
1826
- "benchmark_type": "other",
1827
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1828
- },
1829
- "mutual": {
1830
- "evaluator": null,
1831
- "extractor_location": "lm_eval_pairs",
1832
- "extractor_file": "mutual",
1833
- "benchmark_type": "other",
1834
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1835
- },
1836
- "non": {
1837
- "evaluator": null,
1838
- "extractor_location": "lm_eval_pairs",
1839
- "extractor_file": "non",
1840
- "benchmark_type": "other",
1841
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1842
- },
1843
- "noreval": {
1844
- "evaluator": "log_likelihoods",
1845
- "extractor_location": "lm_eval_pairs",
1846
- "extractor_file": "noreval",
1847
- "benchmark_type": "other",
1848
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1849
- },
1850
- "noreval_gen": {
1851
- "evaluator": "generation",
1852
- "extractor_location": "lm_eval_pairs",
1853
- "extractor_file": "noreval_gen",
1854
- "benchmark_type": "other",
1855
- "explanation": "Text generation evaluation - assesses quality of generated text"
1856
- },
1857
- "noreval_mc": {
1858
- "evaluator": "log_likelihoods",
1859
- "extractor_location": "lm_eval_pairs",
1860
- "extractor_file": "noreval_mc",
1861
- "benchmark_type": "other",
1862
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1863
- },
1864
- "noticia": {
1865
- "evaluator": "generation",
1866
- "extractor_location": "huggingface_pairs",
1867
- "extractor_file": "noticia",
1868
- "benchmark_type": "other",
1869
- "explanation": "Text generation evaluation - assesses quality of generated text"
1870
- },
1871
- "nq_open": {
1872
- "evaluator": "generation",
1873
- "extractor_location": "lm_eval_pairs",
1874
- "extractor_file": "nq_open",
1875
- "benchmark_type": "other",
1876
- "explanation": "Text generation evaluation - assesses quality of generated text"
1877
- },
1878
- "okapi": {
1879
- "evaluator": "log_likelihoods",
1880
- "extractor_location": "lm_eval_pairs",
1881
- "extractor_file": "okapi",
1882
- "benchmark_type": "other",
1883
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1884
- },
1885
- "okapi_arc_multilingual": {
1886
- "evaluator": "log_likelihoods",
1887
- "extractor_location": "lm_eval_pairs",
1888
- "extractor_file": "okapi_arc_multilingual",
1889
- "benchmark_type": "knowledge",
1890
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1891
- },
1892
- "okapi_hellaswag_multilingual": {
1893
- "evaluator": "log_likelihoods",
1894
- "extractor_location": "lm_eval_pairs",
1895
- "extractor_file": "okapi_hellaswag_multilingual",
1896
- "benchmark_type": "knowledge",
1897
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1898
- },
1899
- "okapi_mmlu_multilingual": {
1900
- "evaluator": "log_likelihoods",
1901
- "extractor_location": "lm_eval_pairs",
1902
- "extractor_file": "okapi_mmlu_multilingual",
1903
- "benchmark_type": "knowledge",
1904
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1905
- },
1906
- "okapi_truthfulqa_multilingual": {
1907
- "evaluator": "log_likelihoods",
1908
- "extractor_location": "lm_eval_pairs",
1909
- "extractor_file": "okapi_truthfulqa_multilingual",
1910
- "benchmark_type": "question_answering",
1911
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1912
- },
1913
- "olaph": {
1914
- "evaluator": "generation",
1915
- "extractor_location": "lm_eval_pairs",
1916
- "extractor_file": "olaph",
1917
- "benchmark_type": "other",
1918
- "explanation": "Text generation evaluation - assesses quality of generated text"
1919
- },
1920
- "openbookqa": {
1921
- "evaluator": "log_likelihoods",
1922
- "extractor_location": "lm_eval_pairs",
1923
- "extractor_file": "openbookqa",
1924
- "benchmark_type": "question_answering",
1925
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1926
- },
1927
- "openllm": {
1928
- "evaluator": "log_likelihoods",
1929
- "extractor_location": "huggingface_pairs",
1930
- "extractor_file": "openllm",
1931
- "benchmark_type": "other",
1932
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1933
- },
1934
- "option": {
1935
- "evaluator": null,
1936
- "extractor_location": "lm_eval_pairs",
1937
- "extractor_file": "option",
1938
- "benchmark_type": "other",
1939
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1940
- },
1941
- "paloma": {
1942
- "evaluator": "perplexity",
1943
- "extractor_location": "lm_eval_pairs",
1944
- "extractor_file": "paloma",
1945
- "benchmark_type": "other",
1946
- "explanation": "Perplexity measurement - evaluates model's prediction confidence"
1947
- },
1948
- "parafraseja": {
1949
- "evaluator": "log_likelihoods",
1950
- "extractor_location": "lm_eval_pairs",
1951
- "extractor_file": "parafraseja",
1952
- "benchmark_type": "other",
1953
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1954
- },
1955
- "parafrases": {
1956
- "evaluator": "log_likelihoods",
1957
- "extractor_location": "lm_eval_pairs",
1958
- "extractor_file": "parafrases",
1959
- "benchmark_type": "other",
1960
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1961
- },
1962
- "paws": {
1963
- "evaluator": null,
1964
- "extractor_location": "lm_eval_pairs",
1965
- "extractor_file": "paws",
1966
- "benchmark_type": "other",
1967
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1968
- },
1969
- "paws_x": {
1970
- "evaluator": "log_likelihoods",
1971
- "extractor_location": "lm_eval_pairs",
1972
- "extractor_file": "paws_x",
1973
- "benchmark_type": "other",
1974
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1975
- },
1976
- "pawsx": {
1977
- "evaluator": "log_likelihoods",
1978
- "extractor_location": "lm_eval_pairs",
1979
- "extractor_file": "pawsx",
1980
- "benchmark_type": "other",
1981
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1982
- },
1983
- "persona": {
1984
- "evaluator": "log_likelihoods",
1985
- "extractor_location": "lm_eval_pairs",
1986
- "extractor_file": "persona",
1987
- "benchmark_type": "other",
1988
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1989
- },
1990
- "phrases": {
1991
- "evaluator": null,
1992
- "extractor_location": "lm_eval_pairs",
1993
- "extractor_file": "phrases",
1994
- "benchmark_type": "other",
1995
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1996
- },
1997
- "pile": {
1998
- "evaluator": "exact_match",
1999
- "extractor_location": "lm_eval_pairs",
2000
- "extractor_file": "pile",
2001
- "benchmark_type": "other",
2002
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2003
- },
2004
- "pile_10k": {
2005
- "evaluator": "generation",
2006
- "extractor_location": "lm_eval_pairs",
2007
- "extractor_file": "pile_10k",
2008
- "benchmark_type": "other",
2009
- "explanation": "Text generation evaluation - assesses quality of generated text"
2010
- },
2011
- "piqa": {
2012
- "evaluator": "log_likelihoods",
2013
- "extractor_location": "lm_eval_pairs",
2014
- "extractor_file": "piqa",
2015
- "benchmark_type": "question_answering",
2016
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2017
- },
2018
- "polemo2": {
2019
- "evaluator": "generation",
2020
- "extractor_location": "lm_eval_pairs",
2021
- "extractor_file": "polemo2",
2022
- "benchmark_type": "other",
2023
- "explanation": "Text generation evaluation - assesses quality of generated text"
2024
- },
2025
- "polymath": {
2026
- "evaluator": null,
2027
- "extractor_location": "huggingface_pairs",
2028
- "extractor_file": "polymath",
2029
- "benchmark_type": "mathematics",
2030
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2031
- },
2032
- "polymath_en_high": {
2033
- "evaluator": null,
2034
- "extractor_location": "huggingface_pairs",
2035
- "extractor_file": "polymath_configs",
2036
- "benchmark_type": "mathematics",
2037
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2038
- },
2039
- "polymath_en_medium": {
2040
- "evaluator": null,
2041
- "extractor_location": "huggingface_pairs",
2042
- "extractor_file": "polymath_configs",
2043
- "benchmark_type": "mathematics",
2044
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2045
- },
2046
- "polymath_zh_high": {
2047
- "evaluator": null,
2048
- "extractor_location": "huggingface_pairs",
2049
- "extractor_file": "polymath_configs",
2050
- "benchmark_type": "mathematics",
2051
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2052
- },
2053
- "polymath_zh_medium": {
2054
- "evaluator": null,
2055
- "extractor_location": "huggingface_pairs",
2056
- "extractor_file": "polymath_configs",
2057
- "benchmark_type": "mathematics",
2058
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2059
- },
2060
- "portuguese_bench": {
2061
- "evaluator": "log_likelihoods",
2062
- "extractor_location": "lm_eval_pairs",
2063
- "extractor_file": "portuguese_bench",
2064
- "benchmark_type": "other",
2065
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2066
- },
2067
- "prompt": {
2068
- "evaluator": null,
2069
- "extractor_location": "lm_eval_pairs",
2070
- "extractor_file": "prompt",
2071
- "benchmark_type": "other",
2072
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2073
- },
2074
- "prost": {
2075
- "evaluator": null,
2076
- "extractor_location": "lm_eval_pairs",
2077
- "extractor_file": "prost",
2078
- "benchmark_type": "other",
2079
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2080
- },
2081
- "pubmedqa": {
2082
- "evaluator": null,
2083
- "extractor_location": "lm_eval_pairs",
2084
- "extractor_file": "pubmedqa",
2085
- "benchmark_type": "question_answering",
2086
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2087
- },
2088
- "pythia": {
2089
- "evaluator": "log_likelihoods",
2090
- "extractor_location": "huggingface_pairs",
2091
- "extractor_file": "pythia",
2092
- "benchmark_type": "other",
2093
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2094
- },
2095
- "qa4mre": {
2096
- "evaluator": "log_likelihoods",
2097
- "extractor_location": "lm_eval_pairs",
2098
- "extractor_file": "qa4mre",
2099
- "benchmark_type": "question_answering",
2100
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2101
- },
2102
- "qasper": {
2103
- "evaluator": "generation",
2104
- "extractor_location": "lm_eval_pairs",
2105
- "extractor_file": "qasper",
2106
- "benchmark_type": "question_answering",
2107
- "explanation": "Text generation evaluation - assesses quality of generated text"
2108
- },
2109
- "qnli": {
2110
- "evaluator": null,
2111
- "extractor_location": "lm_eval_pairs",
2112
- "extractor_file": "qnli",
2113
- "benchmark_type": "other",
2114
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2115
- },
2116
- "qnlieu": {
2117
- "evaluator": null,
2118
- "extractor_location": "lm_eval_pairs",
2119
- "extractor_file": "qnlieu",
2120
- "benchmark_type": "other",
2121
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2122
- },
2123
- "qqp": {
2124
- "evaluator": null,
2125
- "extractor_location": "lm_eval_pairs",
2126
- "extractor_file": "qqp",
2127
- "benchmark_type": "other",
2128
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2129
- },
2130
- "quac": {
2131
- "evaluator": null,
2132
- "extractor_location": "lm_eval_pairs",
2133
- "extractor_file": "quac",
2134
- "benchmark_type": "other",
2135
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2136
- },
2137
- "race": {
2138
- "evaluator": "log_likelihoods",
2139
- "extractor_location": "lm_eval_pairs",
2140
- "extractor_file": "race",
2141
- "benchmark_type": "other",
2142
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2143
- },
2144
- "random": {
2145
- "evaluator": null,
2146
- "extractor_location": "lm_eval_pairs",
2147
- "extractor_file": "random",
2148
- "benchmark_type": "other",
2149
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2150
- },
2151
- "realtoxicityprompts": {
2152
- "evaluator": "generation",
2153
- "extractor_location": "lm_eval_pairs",
2154
- "extractor_file": "realtoxicityprompts",
2155
- "benchmark_type": "other",
2156
- "explanation": "Text generation evaluation - assesses quality of generated text"
2157
- },
2158
- "recode": {
2159
- "evaluator": null,
2160
- "extractor_location": "huggingface_pairs",
2161
- "extractor_file": "recode",
2162
- "benchmark_type": "coding",
2163
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2164
- },
2165
- "record": {
2166
- "evaluator": null,
2167
- "extractor_location": "huggingface_pairs",
2168
- "extractor_file": "record",
2169
- "benchmark_type": "other",
2170
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2171
- },
2172
- "reversed": {
2173
- "evaluator": "exact_match",
2174
- "extractor_location": "lm_eval_pairs",
2175
- "extractor_file": "reversed",
2176
- "benchmark_type": "other",
2177
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2178
- },
2179
- "rte": {
2180
- "evaluator": null,
2181
- "extractor_location": "lm_eval_pairs",
2182
- "extractor_file": "rte",
2183
- "benchmark_type": "other",
2184
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2185
- },
2186
- "ruler": {
2187
- "evaluator": null,
2188
- "extractor_location": "lm_eval_pairs",
2189
- "extractor_file": "ruler",
2190
- "benchmark_type": "other",
2191
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2192
- },
2193
- "sciq": {
2194
- "evaluator": "log_likelihoods",
2195
- "extractor_location": "lm_eval_pairs",
2196
- "extractor_file": "sciq",
2197
- "benchmark_type": "other",
2198
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2199
- },
2200
- "score": {
2201
- "evaluator": "log_likelihoods",
2202
- "extractor_location": "lm_eval_pairs",
2203
- "extractor_file": "score",
2204
- "benchmark_type": "other",
2205
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2206
- },
2207
- "scrolls": {
2208
- "evaluator": "generation",
2209
- "extractor_location": "lm_eval_pairs",
2210
- "extractor_file": "scrolls",
2211
- "benchmark_type": "other",
2212
- "explanation": "Text generation evaluation - assesses quality of generated text"
2213
- },
2214
- "self": {
2215
- "evaluator": "log_likelihoods",
2216
- "extractor_location": "lm_eval_pairs",
2217
- "extractor_file": "self",
2218
- "benchmark_type": "other",
2219
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2220
- },
2221
- "sglue": {
2222
- "evaluator": null,
2223
- "extractor_location": "lm_eval_pairs",
2224
- "extractor_file": "sglue",
2225
- "benchmark_type": "other",
2226
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2227
- },
2228
- "simple_cooccurrence_bias": {
2229
- "evaluator": null,
2230
- "extractor_location": "lm_eval_pairs",
2231
- "extractor_file": "simple_cooccurrence_bias",
2232
- "benchmark_type": "other",
2233
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2234
- },
2235
- "siqa": {
2236
- "evaluator": "log_likelihoods",
2237
- "extractor_location": "lm_eval_pairs",
2238
- "extractor_file": "siqa",
2239
- "benchmark_type": "question_answering",
2240
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2241
- },
2242
- "social_iqa": {
2243
- "evaluator": null,
2244
- "extractor_location": "lm_eval_pairs",
2245
- "extractor_file": "social_iqa",
2246
- "benchmark_type": "question_answering",
2247
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2248
- },
2249
- "spanish_bench": {
2250
- "evaluator": "log_likelihoods",
2251
- "extractor_location": "lm_eval_pairs",
2252
- "extractor_file": "spanish_bench",
2253
- "benchmark_type": "other",
2254
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2255
- },
2256
- "squad2": {
2257
- "evaluator": null,
2258
- "extractor_location": "huggingface_pairs",
2259
- "extractor_file": "squad2",
2260
- "benchmark_type": "question_answering",
2261
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2262
- },
2263
- "squad_completion": {
2264
- "evaluator": "exact_match",
2265
- "extractor_location": "lm_eval_pairs",
2266
- "extractor_file": "squad_completion",
2267
- "benchmark_type": "question_answering",
2268
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2269
- },
2270
- "sst2": {
2271
- "evaluator": null,
2272
- "extractor_location": "lm_eval_pairs",
2273
- "extractor_file": "sst2",
2274
- "benchmark_type": "other",
2275
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2276
- },
2277
- "storycloze": {
2278
- "evaluator": "log_likelihoods",
2279
- "extractor_location": "lm_eval_pairs",
2280
- "extractor_file": "storycloze",
2281
- "benchmark_type": "other",
2282
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2283
- },
2284
- "stsb": {
2285
- "evaluator": null,
2286
- "extractor_location": "huggingface_pairs",
2287
- "extractor_file": "stsb",
2288
- "benchmark_type": "other",
2289
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2290
- },
2291
- "summarization": {
2292
- "evaluator": null,
2293
- "extractor_location": "lm_eval_pairs",
2294
- "extractor_file": "summarization",
2295
- "benchmark_type": "other",
2296
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2297
- },
2298
- "super": {
2299
- "evaluator": null,
2300
- "extractor_location": "lm_eval_pairs",
2301
- "extractor_file": "super",
2302
- "benchmark_type": "other",
2303
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2304
- },
2305
- "super_glue": {
2306
- "evaluator": "log_likelihoods",
2307
- "extractor_location": "lm_eval_pairs",
2308
- "extractor_file": "super_glue",
2309
- "benchmark_type": "other",
2310
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2311
- },
2312
- "super_glue_lm_eval_v1": {
2313
- "evaluator": "log_likelihoods",
2314
- "extractor_location": "huggingface_pairs",
2315
- "extractor_file": "super_glue_lm_eval_v1",
2316
- "benchmark_type": "other",
2317
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2318
- },
2319
- "super_glue_lm_eval_v1_seq2seq": {
2320
- "evaluator": "generation",
2321
- "extractor_location": "huggingface_pairs",
2322
- "extractor_file": "super_glue_lm_eval_v1_seq2seq",
2323
- "benchmark_type": "other",
2324
- "explanation": "Text generation evaluation - assesses quality of generated text"
2325
- },
2326
- "super_glue_t5_prompt": {
2327
- "evaluator": "generation",
2328
- "extractor_location": "huggingface_pairs",
2329
- "extractor_file": "super_glue_t5_prompt",
2330
- "benchmark_type": "other",
2331
- "explanation": "Text generation evaluation - assesses quality of generated text"
2332
- },
2333
- "super_gpqa": {
2334
- "evaluator": null,
2335
- "extractor_location": "huggingface_pairs",
2336
- "extractor_file": "super_gpqa",
2337
- "benchmark_type": "question_answering",
2338
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2339
- },
2340
- "superglue": {
2341
- "evaluator": null,
2342
- "extractor_location": "lm_eval_pairs",
2343
- "extractor_file": "superglue",
2344
- "benchmark_type": "other",
2345
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2346
- },
2347
- "supergpqa": {
2348
- "evaluator": null,
2349
- "extractor_location": "lm_eval_pairs",
2350
- "extractor_file": "supergpqa",
2351
- "benchmark_type": "question_answering",
2352
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2353
- },
2354
- "supergpqa_biology": {
2355
- "evaluator": null,
2356
- "extractor_location": "huggingface_pairs",
2357
- "extractor_file": "super_gpqa",
2358
- "benchmark_type": "question_answering",
2359
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2360
- },
2361
- "supergpqa_chemistry": {
2362
- "evaluator": null,
2363
- "extractor_location": "huggingface_pairs",
2364
- "extractor_file": "super_gpqa",
2365
- "benchmark_type": "question_answering",
2366
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2367
- },
2368
- "supergpqa_physics": {
2369
- "evaluator": null,
2370
- "extractor_location": "huggingface_pairs",
2371
- "extractor_file": "super_gpqa",
2372
- "benchmark_type": "question_answering",
2373
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2374
- },
2375
- "swag": {
2376
- "evaluator": "log_likelihoods",
2377
- "extractor_location": "lm_eval_pairs",
2378
- "extractor_file": "swag",
2379
- "benchmark_type": "other",
2380
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2381
- },
2382
- "swde": {
2383
- "evaluator": null,
2384
- "extractor_location": "lm_eval_pairs",
2385
- "extractor_file": "swde",
2386
- "benchmark_type": "other",
2387
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2388
- },
2389
- "sycophancy": {
2390
- "evaluator": "log_likelihoods",
2391
- "extractor_location": "lm_eval_pairs",
2392
- "extractor_file": "sycophancy",
2393
- "benchmark_type": "other",
2394
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2395
- },
2396
- "t0": {
2397
- "evaluator": "generation",
2398
- "extractor_location": "lm_eval_pairs",
2399
- "extractor_file": "t0",
2400
- "benchmark_type": "other",
2401
- "explanation": "Text generation evaluation - assesses quality of generated text"
2402
- },
2403
- "tag": {
2404
- "evaluator": null,
2405
- "extractor_location": "huggingface_pairs",
2406
- "extractor_file": "tag",
2407
- "benchmark_type": "other",
2408
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2409
- },
2410
- "teca": {
2411
- "evaluator": "log_likelihoods",
2412
- "extractor_location": "lm_eval_pairs",
2413
- "extractor_file": "teca",
2414
- "benchmark_type": "other",
2415
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2416
- },
2417
- "tinyarc": {
2418
- "evaluator": "log_likelihoods",
2419
- "extractor_location": "lm_eval_pairs",
2420
- "extractor_file": "tinyarc",
2421
- "benchmark_type": "knowledge",
2422
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2423
- },
2424
- "tinybenchmarks": {
2425
- "evaluator": "log_likelihoods",
2426
- "extractor_location": "lm_eval_pairs",
2427
- "extractor_file": "tinybenchmarks",
2428
- "benchmark_type": "other",
2429
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2430
- },
2431
- "tinygsm8k": {
2432
- "evaluator": "generation",
2433
- "extractor_location": "lm_eval_pairs",
2434
- "extractor_file": "tinygsm8k",
2435
- "benchmark_type": "mathematics",
2436
- "explanation": "Text generation evaluation - assesses quality of generated text"
2437
- },
2438
- "tinyhellaswag": {
2439
- "evaluator": "log_likelihoods",
2440
- "extractor_location": "lm_eval_pairs",
2441
- "extractor_file": "tinyhellaswag",
2442
- "benchmark_type": "knowledge",
2443
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2444
- },
2445
- "tinymmlu": {
2446
- "evaluator": "log_likelihoods",
2447
- "extractor_location": "lm_eval_pairs",
2448
- "extractor_file": "tinymmlu",
2449
- "benchmark_type": "knowledge",
2450
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2451
- },
2452
- "tinytruthfulqa": {
2453
- "evaluator": "log_likelihoods",
2454
- "extractor_location": "lm_eval_pairs",
2455
- "extractor_file": "tinytruthfulqa",
2456
- "benchmark_type": "question_answering",
2457
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2458
- },
2459
- "tinywinogrande": {
2460
- "evaluator": "log_likelihoods",
2461
- "extractor_location": "lm_eval_pairs",
2462
- "extractor_file": "tinywinogrande",
2463
- "benchmark_type": "other",
2464
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2465
- },
2466
- "tmlu": {
2467
- "evaluator": "log_likelihoods",
2468
- "extractor_location": "huggingface_pairs",
2469
- "extractor_file": "tmlu",
2470
- "benchmark_type": "other",
2471
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2472
- },
2473
- "tmmluplus": {
2474
- "evaluator": "log_likelihoods",
2475
- "extractor_location": "lm_eval_pairs",
2476
- "extractor_file": "tmmluplus",
2477
- "benchmark_type": "knowledge",
2478
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2479
- },
2480
- "toxigen": {
2481
- "evaluator": "log_likelihoods",
2482
- "extractor_location": "lm_eval_pairs",
2483
- "extractor_file": "toxigen",
2484
- "benchmark_type": "other",
2485
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2486
- },
2487
- "translation": {
2488
- "evaluator": "generation",
2489
- "extractor_location": "lm_eval_pairs",
2490
- "extractor_file": "translation",
2491
- "benchmark_type": "translation",
2492
- "explanation": "Text generation evaluation - assesses quality of generated text"
2493
- },
2494
- "triviaqa": {
2495
- "evaluator": "generation",
2496
- "extractor_location": "lm_eval_pairs",
2497
- "extractor_file": "triviaqa",
2498
- "benchmark_type": "question_answering",
2499
- "explanation": "Text generation evaluation - assesses quality of generated text"
2500
- },
2501
- "truthfulqa": {
2502
- "evaluator": "log_likelihoods",
2503
- "extractor_location": "lm_eval_pairs",
2504
- "extractor_file": "truthfulqa",
2505
- "benchmark_type": "question_answering",
2506
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2507
- },
2508
- "truthfulqa_gen": {
2509
- "evaluator": "generation",
2510
- "extractor_location": "lm_eval_pairs",
2511
- "extractor_file": "truthfulqa_gen",
2512
- "benchmark_type": "question_answering",
2513
- "explanation": "Text generation evaluation - assesses quality of generated text"
2514
- },
2515
- "truthfulqa_mc1": {
2516
- "evaluator": "log_likelihoods",
2517
- "extractor_location": "lm_eval_pairs",
2518
- "extractor_file": "truthfulqa_mc1",
2519
- "benchmark_type": "question_answering",
2520
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2521
- },
2522
- "truthfulqa_mc2": {
2523
- "evaluator": "log_likelihoods",
2524
- "extractor_location": "lm_eval_pairs",
2525
- "extractor_file": "truthfulqa_mc2",
2526
- "benchmark_type": "question_answering",
2527
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2528
- },
2529
- "truthfulqa_multi": {
2530
- "evaluator": "mixed",
2531
- "extractor_location": "lm_eval_pairs",
2532
- "extractor_file": "truthfulqa_multi",
2533
- "benchmark_type": "question_answering",
2534
- "explanation": "Uses mixed evaluator"
2535
- },
2536
- "turblimp_core": {
2537
- "evaluator": null,
2538
- "extractor_location": "lm_eval_pairs",
2539
- "extractor_file": "turblimp_core",
2540
- "benchmark_type": "other",
2541
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2542
- },
2543
- "turkishmmlu": {
2544
- "evaluator": "log_likelihoods",
2545
- "extractor_location": "lm_eval_pairs",
2546
- "extractor_file": "turkishmmlu",
2547
- "benchmark_type": "knowledge",
2548
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2549
- },
2550
- "twenty_newsgroups": {
2551
- "evaluator": "exact_match",
2552
- "extractor_location": "lm_eval_pairs",
2553
- "extractor_file": "twenty_newsgroups",
2554
- "benchmark_type": "other",
2555
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2556
- },
2557
- "unfair": {
2558
- "evaluator": "generation",
2559
- "extractor_location": "lm_eval_pairs",
2560
- "extractor_file": "unfair",
2561
- "benchmark_type": "other",
2562
- "explanation": "Text generation evaluation - assesses quality of generated text"
2563
- },
2564
- "unitxt": {
2565
- "evaluator": "generation",
2566
- "extractor_location": "lm_eval_pairs",
2567
- "extractor_file": "unitxt",
2568
- "benchmark_type": "other",
2569
- "explanation": "Text generation evaluation - assesses quality of generated text"
2570
- },
2571
- "unscramble": {
2572
- "evaluator": "exact_match",
2573
- "extractor_location": "lm_eval_pairs",
2574
- "extractor_file": "unscramble",
2575
- "benchmark_type": "other",
2576
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2577
- },
2578
- "vaxx": {
2579
- "evaluator": "log_likelihoods",
2580
- "extractor_location": "lm_eval_pairs",
2581
- "extractor_file": "vaxx",
2582
- "benchmark_type": "other",
2583
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2584
- },
2585
- "webqs": {
2586
- "evaluator": "log_likelihoods",
2587
- "extractor_location": "lm_eval_pairs",
2588
- "extractor_file": "webqs",
2589
- "benchmark_type": "other",
2590
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2591
- },
2592
- "wic": {
2593
- "evaluator": "log_likelihoods",
2594
- "extractor_location": "lm_eval_pairs",
2595
- "extractor_file": "wic",
2596
- "benchmark_type": "other",
2597
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2598
- },
2599
- "wiceu": {
2600
- "evaluator": "log_likelihoods",
2601
- "extractor_location": "huggingface_pairs",
2602
- "extractor_file": "wiceu",
2603
- "benchmark_type": "other",
2604
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2605
- },
2606
- "wikitext": {
2607
- "evaluator": "generation",
2608
- "extractor_location": "lm_eval_pairs",
2609
- "extractor_file": "wikitext",
2610
- "benchmark_type": "other",
2611
- "explanation": "Text generation evaluation - assesses quality of generated text"
2612
- },
2613
- "winogender": {
2614
- "evaluator": "generation",
2615
- "extractor_location": "lm_eval_pairs",
2616
- "extractor_file": "winogender",
2617
- "benchmark_type": "other",
2618
- "explanation": "Text generation evaluation - assesses quality of generated text"
2619
- },
2620
- "winogrande": {
2621
- "evaluator": "log_likelihoods",
2622
- "extractor_location": "lm_eval_pairs",
2623
- "extractor_file": "winogrande",
2624
- "benchmark_type": "other",
2625
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2626
- },
2627
- "wmdp": {
2628
- "evaluator": "log_likelihoods",
2629
- "extractor_location": "lm_eval_pairs",
2630
- "extractor_file": "wmdp",
2631
- "benchmark_type": "other",
2632
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2633
- },
2634
- "wmt14": {
2635
- "evaluator": "generation",
2636
- "extractor_location": "lm_eval_pairs",
2637
- "extractor_file": "wmt14",
2638
- "benchmark_type": "translation",
2639
- "explanation": "Text generation evaluation - assesses quality of generated text"
2640
- },
2641
- "wmt14_en_fr": {
2642
- "evaluator": "generation",
2643
- "extractor_location": "huggingface_pairs",
2644
- "extractor_file": "wmt14_en_fr",
2645
- "benchmark_type": "translation",
2646
- "explanation": "Text generation evaluation - assesses quality of generated text"
2647
- },
2648
- "wmt14_fr_en": {
2649
- "evaluator": "generation",
2650
- "extractor_location": "huggingface_pairs",
2651
- "extractor_file": "wmt14_fr_en",
2652
- "benchmark_type": "translation",
2653
- "explanation": "Text generation evaluation - assesses quality of generated text"
2654
- },
2655
- "wmt16": {
2656
- "evaluator": "generation",
2657
- "extractor_location": "lm_eval_pairs",
2658
- "extractor_file": "wmt16",
2659
- "benchmark_type": "translation",
2660
- "explanation": "Text generation evaluation - assesses quality of generated text"
2661
- },
2662
- "wmt16_de_en": {
2663
- "evaluator": "generation",
2664
- "extractor_location": "huggingface_pairs",
2665
- "extractor_file": "wmt16_de_en",
2666
- "benchmark_type": "translation",
2667
- "explanation": "Text generation evaluation - assesses quality of generated text"
2668
- },
2669
- "wmt16_en_de": {
2670
- "evaluator": "generation",
2671
- "extractor_location": "huggingface_pairs",
2672
- "extractor_file": "wmt16_en_de",
2673
- "benchmark_type": "translation",
2674
- "explanation": "Text generation evaluation - assesses quality of generated text"
2675
- },
2676
- "wmt16_en_ro": {
2677
- "evaluator": "generation",
2678
- "extractor_location": "huggingface_pairs",
2679
- "extractor_file": "wmt16_en_ro",
2680
- "benchmark_type": "translation",
2681
- "explanation": "Text generation evaluation - assesses quality of generated text"
2682
- },
2683
- "wmt16_ro_en": {
2684
- "evaluator": "generation",
2685
- "extractor_location": "huggingface_pairs",
2686
- "extractor_file": "wmt16_ro_en",
2687
- "benchmark_type": "translation",
2688
- "explanation": "Text generation evaluation - assesses quality of generated text"
2689
- },
2690
- "wmt_ro_en_t5_prompt": {
2691
- "evaluator": "generation",
2692
- "extractor_location": "huggingface_pairs",
2693
- "extractor_file": "wmt_ro_en_t5_prompt",
2694
- "benchmark_type": "translation",
2695
- "explanation": "Text generation evaluation - assesses quality of generated text"
2696
- },
2697
- "wnli": {
2698
- "evaluator": "log_likelihoods",
2699
- "extractor_location": "lm_eval_pairs",
2700
- "extractor_file": "wnli",
2701
- "benchmark_type": "other",
2702
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2703
- },
2704
- "wsc": {
2705
- "evaluator": "log_likelihoods",
2706
- "extractor_location": "lm_eval_pairs",
2707
- "extractor_file": "wsc",
2708
- "benchmark_type": "other",
2709
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2710
- },
2711
- "wsc273": {
2712
- "evaluator": "log_likelihoods",
2713
- "extractor_location": "lm_eval_pairs",
2714
- "extractor_file": "wsc273",
2715
- "benchmark_type": "other",
2716
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2717
- },
2718
- "xcopa": {
2719
- "evaluator": "log_likelihoods",
2720
- "extractor_location": "lm_eval_pairs",
2721
- "extractor_file": "xcopa",
2722
- "benchmark_type": "other",
2723
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2724
- },
2725
- "xlsum": {
2726
- "evaluator": "generation",
2727
- "extractor_location": "lm_eval_pairs",
2728
- "extractor_file": "xlsum",
2729
- "benchmark_type": "other",
2730
- "explanation": "Text generation evaluation - assesses quality of generated text"
2731
- },
2732
- "xnli": {
2733
- "evaluator": "log_likelihoods",
2734
- "extractor_location": "lm_eval_pairs",
2735
- "extractor_file": "xnli",
2736
- "benchmark_type": "other",
2737
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2738
- },
2739
- "xquad": {
2740
- "evaluator": "generation",
2741
- "extractor_location": "lm_eval_pairs",
2742
- "extractor_file": "xquad",
2743
- "benchmark_type": "other",
2744
- "explanation": "Text generation evaluation - assesses quality of generated text"
2745
- },
2746
- "xstorycloze": {
2747
- "evaluator": "log_likelihoods",
2748
- "extractor_location": "lm_eval_pairs",
2749
- "extractor_file": "xstorycloze",
2750
- "benchmark_type": "other",
2751
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2752
- },
2753
- "xsum": {
2754
- "evaluator": "exact_match",
2755
- "extractor_location": "huggingface_pairs",
2756
- "extractor_file": "xsum",
2757
- "benchmark_type": "other",
2758
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2759
- },
2760
- "xwinograd": {
2761
- "evaluator": "log_likelihoods",
2762
- "extractor_location": "lm_eval_pairs",
2763
- "extractor_file": "xwinograd",
2764
- "benchmark_type": "other",
2765
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2766
- },
2767
- "yahoo": {
2768
- "evaluator": "log_likelihoods",
2769
- "extractor_location": "lm_eval_pairs",
2770
- "extractor_file": "yahoo",
2771
- "benchmark_type": "other",
2772
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2773
- },
2774
- "zhoblimp": {
2775
- "evaluator": "log_likelihoods",
2776
- "extractor_location": "lm_eval_pairs",
2777
- "extractor_file": "zhoblimp",
2778
- "benchmark_type": "other",
2779
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2780
- }
2781
- }
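Taken together, the removed manifest above maps each benchmark name to an evaluator ("log_likelihoods", "generation", "exact_match", "perplexity", "mixed", or null), an extractor location and file, and a benchmark type. As a rough illustration only — the helper below is hypothetical and not part of the wisent codebase — consuming such a mapping amounts to a dictionary lookup that fails loudly on the null entries, which is what the repeated "NO EVALUATOR DEFINED - will fail with error" explanations describe:

```python
import json
from pathlib import Path


def resolve_evaluator(manifest_path: str, benchmark: str) -> str:
    """Hypothetical helper: look up the evaluator for a benchmark in a manifest
    shaped like the JSON removed above (benchmark -> {"evaluator": ..., ...})."""
    manifest = json.loads(Path(manifest_path).read_text())
    entry = manifest.get(benchmark)
    if entry is None:
        raise KeyError(f"benchmark {benchmark!r} is not listed in the manifest")
    evaluator = entry["evaluator"]
    if evaluator is None:
        # Entries whose explanation reads "NO EVALUATOR DEFINED - will fail with error"
        # store null in the "evaluator" field, so this lookup raises for them.
        raise ValueError(f"no evaluator defined for benchmark {benchmark!r}")
    return evaluator  # e.g. "log_likelihoods", "generation", "exact_match", "perplexity"
```

Under that reading, resolve_evaluator(path, "mmlu") would return "log_likelihoods", while resolve_evaluator(path, "mbpp") would raise, mirroring the null entry in the removed file.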