wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1020)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +22 -6
  3. wisent/core/activations/activation_cache.py +393 -0
  4. wisent/core/activations/activations.py +22 -40
  5. wisent/core/activations/activations_collector.py +145 -373
  6. wisent/core/activations/classifier_inference_strategy.py +195 -0
  7. wisent/core/activations/core/atoms.py +8 -92
  8. wisent/core/activations/extraction_strategy.py +480 -0
  9. wisent/core/agent/diagnose/response_diagnostics.py +3 -3
  10. wisent/core/agent/diagnose.py +3 -3
  11. wisent/core/autonomous_agent.py +2 -2
  12. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  13. wisent/core/cli/__init__.py +2 -1
  14. wisent/core/cli/agent/apply_steering.py +25 -31
  15. wisent/core/cli/agent/evaluate_response.py +18 -20
  16. wisent/core/cli/agent/train_classifier.py +36 -26
  17. wisent/core/cli/check_linearity.py +35 -3
  18. wisent/core/cli/cluster_benchmarks.py +470 -0
  19. wisent/core/cli/create_steering_vector.py +19 -9
  20. wisent/core/cli/diagnose_vectors.py +7 -4
  21. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  22. wisent/core/cli/generate_pairs_from_task.py +9 -56
  23. wisent/core/cli/generate_vector_from_task.py +4 -0
  24. wisent/core/cli/geometry_search.py +137 -0
  25. wisent/core/cli/get_activations.py +13 -37
  26. wisent/core/cli/method_optimizer.py +860 -0
  27. wisent/core/cli/modify_weights.py +3 -2
  28. wisent/core/cli/optimize.py +44 -5
  29. wisent/core/cli/optimize_classification.py +5 -6
  30. wisent/core/cli/optimize_sample_size.py +9 -23
  31. wisent/core/cli/optimize_steering.py +433 -159
  32. wisent/core/cli/optimize_weights.py +67 -7
  33. wisent/core/cli/preview_pairs.py +203 -0
  34. wisent/core/cli/steering_method_trainer.py +8 -7
  35. wisent/core/cli/steering_search_space.py +20 -15
  36. wisent/core/cli/tasks.py +31 -117
  37. wisent/core/cli/train_unified_goodness.py +18 -19
  38. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1582 -177
  39. wisent/core/contrastive_pairs/diagnostics/linearity.py +70 -80
  40. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  53. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  54. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
  55. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  56. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  57. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  58. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  59. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  60. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  61. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  62. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  63. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  64. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  65. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  66. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  67. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  68. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  69. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
  70. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +11 -5
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  273. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  274. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  275. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  276. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  277. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  278. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  279. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  280. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  281. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  282. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  283. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  284. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  285. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  286. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  287. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  288. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  289. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  290. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  291. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  292. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  293. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  294. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  295. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  296. wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
  297. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
  298. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
  299. wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
  300. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
  301. wisent/core/evaluators/personalization/coherence.py +46 -0
  302. wisent/core/geometry_runner.py +995 -0
  303. wisent/core/geometry_search_space.py +237 -0
  304. wisent/core/hyperparameter_optimizer.py +14 -14
  305. wisent/core/lm_eval_harness_ground_truth.py +7 -11
  306. wisent/core/main.py +6 -0
  307. wisent/core/models/core/atoms.py +5 -3
  308. wisent/core/models/wisent_model.py +9 -8
  309. wisent/core/opti/methods/opti_weights.py +29 -2
  310. wisent/core/optuna/classifier/activation_generator.py +14 -12
  311. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  312. wisent/core/optuna/steering/steering_optimization.py +14 -9
  313. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  314. wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
  315. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  316. wisent/core/parser_arguments/generate_vector_from_task_parser.py +22 -2
  317. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  318. wisent/core/parser_arguments/main_parser.py +16 -0
  319. wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
  320. wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
  321. wisent/core/parser_arguments/tasks_parser.py +7 -19
  322. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  323. wisent/core/steering.py +5 -3
  324. wisent/core/steering_methods/core/atoms.py +1 -2
  325. wisent/core/steering_methods/methods/caa.py +1 -1
  326. wisent/core/steering_methods/methods/hyperplane.py +75 -0
  327. wisent/core/steering_methods/methods/prism.py +1 -2
  328. wisent/core/steering_methods/methods/pulse.py +39 -8
  329. wisent/core/steering_methods/methods/titan.py +59 -14
  330. wisent/core/steering_methods/registry.py +52 -12
  331. wisent/core/steering_optimizer.py +15 -15
  332. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  333. wisent/core/trainers/steering_trainer.py +11 -20
  334. wisent/core/utils/device.py +27 -27
  335. wisent/core/utils/layer_combinations.py +70 -0
  336. wisent/examples/__init__.py +1 -0
  337. wisent/examples/scripts/__init__.py +1 -0
  338. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  339. wisent/examples/scripts/discover_directions.py +469 -0
  340. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  341. wisent/examples/scripts/generate_paper_data.py +384 -0
  342. wisent/examples/scripts/intervention_validation.py +626 -0
  343. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  344. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  345. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  346. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  347. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  348. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  349. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  350. wisent/examples/scripts/search_all_short_names.py +31 -0
  351. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  352. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  353. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  354. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  355. wisent/examples/scripts/test_one_benchmark.py +324 -0
  356. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  357. wisent/examples/scripts/threshold_analysis.py +434 -0
  358. wisent/examples/scripts/visualization_gallery.py +582 -0
  359. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  360. wisent/parameters/lm_eval/category_directions.json +137 -0
  361. wisent/parameters/lm_eval/repair_plan.json +282 -0
  362. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
  363. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  364. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  365. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  366. wisent/scripts/run_quality_metrics_sweep.sh +22 -27
  367. wisent/tests/test_aggregation_geometry.py +236 -0
  368. wisent/tests/test_detector_accuracy.py +163 -0
  369. wisent/tests/test_geometry_exhaustive.py +1202 -0
  370. wisent/tests/visualize_geometry.py +255 -61
  371. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  372. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/RECORD +376 -974
  373. wisent/core/activations/prompt_construction_strategy.py +0 -47
  374. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  375. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
  376. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
  377. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
  378. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
  379. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
  380. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
  381. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
  382. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
  383. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
  384. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
  385. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
  386. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
  387. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
  388. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
  389. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
  390. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
  391. wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
  392. wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
  393. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
  394. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
  395. wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
  396. wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
  397. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
  398. wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
  399. wisent/examples/scripts/results/benchmark_tags.json +0 -917
  400. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
  401. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
  402. wisent/examples/scripts/results/failing_benchmarks.json +0 -946
  403. wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
  404. wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
  405. wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
  406. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
  407. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
  408. wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
  409. wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
  410. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
  411. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
  412. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
  413. wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
  414. wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
  415. wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
  416. wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
  417. wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
  418. wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
  419. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
  420. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
  421. wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
  422. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
  423. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
  424. wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
  425. wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
  426. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
  427. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
  428. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
  429. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
  430. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  431. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
  432. wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
  433. wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
  434. wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
  435. wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
  436. wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
  437. wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
  438. wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
  439. wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
  440. wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
  441. wisent/examples/scripts/results/test_aime_pairs.json +0 -8
  442. wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
  443. wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
  444. wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
  445. wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
  446. wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
  447. wisent/examples/scripts/results/test_anli_pairs.json +0 -8
  448. wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
  449. wisent/examples/scripts/results/test_apps_pairs.json +0 -8
  450. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
  451. wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
  452. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
  453. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
  454. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
  455. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
  456. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
  457. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
  458. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
  459. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
  460. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
  461. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
  462. wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
  463. wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
  464. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
  465. wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
  466. wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
  467. wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
  468. wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
  469. wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
  470. wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
  471. wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
  472. wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
  473. wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
  474. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
  475. wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
  476. wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
  477. wisent/examples/scripts/results/test_atis_pairs.json +0 -8
  478. wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
  479. wisent/examples/scripts/results/test_babi_pairs.json +0 -8
  480. wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
  481. wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
  482. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
  483. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
  484. wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
  485. wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
  486. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
  487. wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
  488. wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
  489. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
  490. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
  491. wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
  492. wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
  493. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
  494. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
  495. wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
  496. wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
  497. wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
  498. wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
  499. wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
  500. wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
  501. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
  502. wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
  503. wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
  504. wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
  505. wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
  506. wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
  507. wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
  508. wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
  509. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
  510. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
  511. wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
  512. wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
  513. wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
  514. wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
  515. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
  516. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
  517. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
  518. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
  519. wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
  520. wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
  521. wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
  522. wisent/examples/scripts/results/test_c4_pairs.json +0 -8
  523. wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
  524. wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
  525. wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
  526. wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
  527. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
  528. wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
  529. wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
  530. wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
  531. wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
  532. wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
  533. wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
  534. wisent/examples/scripts/results/test_cb_pairs.json +0 -8
  535. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
  536. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
  537. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
  538. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
  539. wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
  540. wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
  541. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
  542. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
  543. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
  544. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
  545. wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
  546. wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
  547. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
  548. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
  549. wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
  550. wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
  551. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
  552. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
  553. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
  554. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
  555. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
  556. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
  557. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
  558. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
  559. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
  560. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
  561. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
  562. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
  563. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
  564. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
  565. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
  566. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
  567. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
  568. wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
  569. wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
  570. wisent/examples/scripts/results/test_cola_pairs.json +0 -8
  571. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
  572. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
  573. wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
  574. wisent/examples/scripts/results/test_conala_pairs.json +0 -8
  575. wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
  576. wisent/examples/scripts/results/test_concode_pairs.json +0 -8
  577. wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
  578. wisent/examples/scripts/results/test_copa_pairs.json +0 -8
  579. wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
  580. wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
  581. wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
  582. wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
  583. wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
  584. wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
  585. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
  586. wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
  587. wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
  588. wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
  589. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
  590. wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
  591. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
  592. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
  593. wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
  594. wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
  595. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
  596. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
  597. wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
  598. wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
  599. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
  600. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
  601. wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
  602. wisent/examples/scripts/results/test_drop_pairs.json +0 -8
  603. wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
  604. wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
  605. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
  606. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
  607. wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
  608. wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
  609. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
  610. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
  611. wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
  612. wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
  613. wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
  614. wisent/examples/scripts/results/test_escola_pairs.json +0 -8
  615. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
  616. wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
  617. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
  618. wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
  619. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
  620. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
  621. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
  622. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
  623. wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
  624. wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
  625. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
  626. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
  627. wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
  628. wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
  629. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
  630. wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
  631. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
  632. wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
  633. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  634. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  635. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
  636. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
  637. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
  638. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
  639. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
  640. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
  641. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
  642. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
  643. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  644. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  645. wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
  646. wisent/examples/scripts/results/test_fda_pairs.json +0 -8
  647. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
  648. wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
  649. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
  650. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
  651. wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
  652. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
  653. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
  654. wisent/examples/scripts/results/test_fld_pairs.json +0 -8
  655. wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
  656. wisent/examples/scripts/results/test_flores_pairs.json +0 -14
  657. wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
  658. wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
  659. wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
  660. wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
  661. wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
  662. wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
  663. wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
  664. wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
  665. wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
  666. wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
  667. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
  668. wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
  669. wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
  670. wisent/examples/scripts/results/test_glue_pairs.json +0 -14
  671. wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
  672. wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
  673. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
  674. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
  675. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
  676. wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
  677. wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
  678. wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
  679. wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
  680. wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
  681. wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
  682. wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
  683. wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
  684. wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
  685. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
  686. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
  687. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
  688. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
  689. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
  690. wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
  691. wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
  692. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
  693. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
  694. wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
  695. wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
  696. wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
  697. wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
  698. wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
  699. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
  700. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
  701. wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
  702. wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
  703. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
  704. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
  705. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
  706. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
  707. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
  708. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
  709. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
  710. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
  711. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
  712. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
  713. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
  714. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
  715. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
  716. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
  717. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
  718. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
  719. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
  720. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
  721. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
  722. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
  723. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
  724. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
  725. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
  726. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
  727. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
  728. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
  729. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
  730. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
  731. wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
  732. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
  733. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
  734. wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
  735. wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
  736. wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
  737. wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
  738. wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
  739. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
  740. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
  741. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
  742. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
  743. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
  744. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
  745. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
  746. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
  747. wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
  748. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  749. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  750. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
  751. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
  752. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
  753. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
  754. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
  755. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
  756. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
  757. wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
  758. wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
  759. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  760. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  761. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  762. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  763. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
  764. wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
  765. wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
  766. wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
  767. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
  768. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
  769. wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
  770. wisent/examples/scripts/results/test_libra_pairs.json +0 -14
  771. wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
  772. wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
  773. wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
  774. wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
  775. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
  776. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
  777. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
  778. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
  779. wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
  780. wisent/examples/scripts/results/test_llama_pairs.json +0 -8
  781. wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
  782. wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
  783. wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
  784. wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
  785. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
  786. wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
  787. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
  788. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
  789. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
  790. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
  791. wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
  792. wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
  793. wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
  794. wisent/examples/scripts/results/test_math500_pairs.json +0 -8
  795. wisent/examples/scripts/results/test_math_evaluation.json +0 -30
  796. wisent/examples/scripts/results/test_math_pairs.json +0 -8
  797. wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
  798. wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
  799. wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
  800. wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
  801. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
  802. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
  803. wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
  804. wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
  805. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
  806. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
  807. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
  808. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
  809. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
  810. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
  811. wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
  812. wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
  813. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
  814. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
  815. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
  816. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
  817. wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
  818. wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
  819. wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
  820. wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
  821. wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
  822. wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
  823. wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
  824. wisent/examples/scripts/results/test_mela_pairs.json +0 -14
  825. wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
  826. wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
  827. wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
  828. wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
  829. wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
  830. wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
  831. wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
  832. wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
  833. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
  834. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
  835. wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
  836. wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
  837. wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
  838. wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
  839. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
  840. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
  841. wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
  842. wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
  843. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
  844. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
  845. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
  846. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
  847. wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
  848. wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
  849. wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
  850. wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
  851. wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
  852. wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
  853. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
  854. wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
  855. wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
  856. wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
  857. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
  858. wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
  859. wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
  860. wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
  861. wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
  862. wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
  863. wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
  864. wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
  865. wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
  866. wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
  867. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
  868. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
  869. wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
  870. wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
  871. wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
  872. wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
  873. wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
  874. wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
  875. wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
  876. wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
  877. wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
  878. wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
  879. wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
  880. wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
  881. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
  882. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
  883. wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
  884. wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
  885. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
  886. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
  887. wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
  888. wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
  889. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
  890. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
  891. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
  892. wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
  893. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
  894. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
  895. wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
  896. wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
  897. wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
  898. wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
  899. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
  900. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
  901. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
  902. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
  903. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
  904. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
  905. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
  906. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
  907. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
  908. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
  909. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  910. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  911. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  912. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  913. wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
  914. wisent/examples/scripts/results/test_prost_pairs.json +0 -8
  915. wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
  916. wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
  917. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
  918. wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
  919. wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
  920. wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
  921. wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
  922. wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
  923. wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
  924. wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
  925. wisent/examples/scripts/results/test_race_evaluation.json +0 -30
  926. wisent/examples/scripts/results/test_race_pairs.json +0 -8
  927. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
  928. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
  929. wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
  930. wisent/examples/scripts/results/test_recode_pairs.json +0 -8
  931. wisent/examples/scripts/results/test_record_evaluation.json +0 -30
  932. wisent/examples/scripts/results/test_record_pairs.json +0 -8
  933. wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
  934. wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
  935. wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
  936. wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
  937. wisent/examples/scripts/results/test_score_evaluation.json +0 -51
  938. wisent/examples/scripts/results/test_score_pairs.json +0 -14
  939. wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
  940. wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
  941. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
  942. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
  943. wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
  944. wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
  945. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
  946. wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
  947. wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
  948. wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
  949. wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
  950. wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
  951. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
  952. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
  953. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
  954. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
  955. wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
  956. wisent/examples/scripts/results/test_swag_pairs.json +0 -8
  957. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
  958. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
  959. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
  960. wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
  961. wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
  962. wisent/examples/scripts/results/test_translation_pairs.json +0 -14
  963. wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
  964. wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
  965. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
  966. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
  967. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
  968. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
  969. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
  970. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
  971. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
  972. wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
  973. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
  974. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
  975. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
  976. wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
  977. wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
  978. wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
  979. wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
  980. wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
  981. wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
  982. wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
  983. wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
  984. wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
  985. wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
  986. wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
  987. wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
  988. wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
  989. wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
  990. wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
  991. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
  992. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
  993. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
  994. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
  995. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
  996. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
  997. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
  998. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
  999. wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
  1000. wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
  1001. wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
  1002. wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
  1003. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
  1004. wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
  1005. wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
  1006. wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
  1007. wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
  1008. wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
  1009. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
  1010. wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
  1011. wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
  1012. wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
  1013. wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
  1014. wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
  1015. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
  1016. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
  1017. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  1018. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  1019. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  1020. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
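The removed file shown in the hunk below is a benchmark-to-evaluator manifest: a JSON object mapping each benchmark name to its evaluator ("log_likelihoods", "exact_match", "generation", or null), the location and file of its pair extractor, a benchmark type, and a human-readable explanation. As a minimal sketch only (assuming a local copy of the removed JSON saved as "evaluator_manifest.json"; the path and this audit script are illustrative and not part of the wisent package API), such a manifest could be audited for entries that would fail at runtime because no evaluator is defined:

```python
# Sketch: audit a benchmark-evaluator manifest of the shape shown in the hunk below.
# Assumption: the removed JSON has been saved locally as "evaluator_manifest.json".
import json
from collections import Counter

with open("evaluator_manifest.json", encoding="utf-8") as fh:
    manifest = json.load(fh)

# Count how often each evaluator type is used across benchmarks.
usage = Counter(entry["evaluator"] or "UNDEFINED" for entry in manifest.values())
print(usage)

# List benchmarks with no evaluator defined ("NO EVALUATOR DEFINED - will fail with error").
missing = sorted(name for name, entry in manifest.items() if entry["evaluator"] is None)
print(f"{len(missing)} benchmarks without an evaluator:", ", ".join(missing))
```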
@@ -1,2781 +0,0 @@
- {
- "Tag": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "tag",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "aclue": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "aclue",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "acp_bench": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "acp_bench",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "acp_bench_hard": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "acp_bench_hard",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "advanced": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "advanced",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "aexams": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "aexams",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "afrimgsm": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "afrimgsm",
- "benchmark_type": "mathematics",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "afrimmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "afrimmlu",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "afrixnli": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "afrixnli",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "ag": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ag",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "agieval": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "agieval",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "ai2_arc": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ai2_arc",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "aime": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "aime",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "aime2024": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "aime",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "aime2025": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "aime",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "anagrams1": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "anagrams1",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "anagrams2": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "anagrams2",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "anli": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "anli",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "apps": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "apps",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "arabculture": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arabculture",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arabic": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arabic",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arabic_leaderboard_complete": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arabic_leaderboard_complete",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arabic_leaderboard_light": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arabic_leaderboard_light",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arabicmmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arabicmmlu",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "aradice": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "aradice",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arc": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arc",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arc_challenge": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arc_challenge",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "arc_easy": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arc_easy",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "argument": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "argument",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "arithmetic": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "arithmetic",
- "benchmark_type": "mathematics",
- "explanation": "Text comparison (WARNING: should use execution for mathematics)"
- },
- "asdiv": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "asdiv",
- "benchmark_type": "mathematics",
- "explanation": "Text comparison (WARNING: should use execution for mathematics)"
- },
- "asdiv_cot_llama": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "mathematics",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "assin": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "assin",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "atis": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "atis",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "babi": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "babi",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "babilong": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "babilong",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "bangla_mmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "bangla_mmlu",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "banking77": {
- "evaluator": "exact_match",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "banking77",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "basque_bench": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "basque_bench",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "basque_glue": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "basque_glue",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "basqueglue": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "basqueglue",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "bbh": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "bbh",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "bbq": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "bbq",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "bec2016eu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "bec2016eu",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "belebele": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "belebele",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "benchmarks": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "benchmarks",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "bertaqa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "bertaqa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "bhs": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "bhs",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "bhtc": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "bhtc",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "bigbench": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "bigbench",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "blimp": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "blimp",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "blimp_nl": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "blimp_nl",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "boolq": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "boolq",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "boolq_seq2seq": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "boolq_seq2seq",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "c4": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "c4",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "cabbq": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cabbq",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "cabreu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cabreu",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "careqa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "careqa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "catalan_bench": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "catalan_bench",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "catalanqa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "catalanqa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "catcola": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "catcola",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "cb": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "cb",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "ceval": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ceval",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "ceval_valid": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ceval_valid",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "chain": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "chain",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "chain_of_thought": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "math",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "chartqa": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "chartqa",
- "benchmark_type": "question_answering",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "claim": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "claim",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "click": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "click",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "cmmlu": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cmmlu",
- "benchmark_type": "knowledge",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "cnn": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cnn",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "cocoteros": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cocoteros",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "code2text": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "code2text",
- "benchmark_type": "coding",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "code_x_glue": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "code_x_glue",
- "benchmark_type": "coding",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "codexglue": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "codexglue_code_to_text_go": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "codexglue_code_to_text_java": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "codexglue_code_to_text_javascript": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "codexglue_code_to_text_php": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "codexglue_code_to_text_python": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "codexglue_code_to_text_ruby": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "codexglue",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "coedit": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "coedit",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "cola": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cola",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "commonsense": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "commonsense",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "commonsense_qa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "commonsense_qa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "conala": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "conala",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "concode": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "concode",
- "benchmark_type": "coding",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "copa": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "copa",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "copal_id": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "copal_id",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "coqa": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "coqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "coqcat": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "coqcat",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "crows_pairs": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "crows_pairs",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "csatqa": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "csatqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "cycle": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cycle",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "cycle_letters": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "cycle_letters",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "darija_bench": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "darija_bench",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "darijahellaswag": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "darijahellaswag",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "darijammlu": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "darijammlu",
- "benchmark_type": "knowledge",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "dbpedia": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "dbpedia",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "discrim_eval": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "discrim_eval",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "doc": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "doc",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "doc_vqa": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "doc_vqa",
- "benchmark_type": "question_answering",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "drop": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "drop",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "ds1000": {
- "evaluator": "exact_match",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "ds1000",
- "benchmark_type": "coding",
- "explanation": "Text comparison (WARNING: should use execution for coding)"
- },
- "ds_1000": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "ds_1000",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "egyhellaswag": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "egyhellaswag",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "egymmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "egymmlu",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "epec": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "epec",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "eq": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eq",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "eq_bench": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eq_bench",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "eq_bench_ca": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eq_bench_ca",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "eq_bench_es": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eq_bench_es",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "esbbq": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "esbbq",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "escola": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "escola",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "ethics": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ethics",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "ethos": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "ethos",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "eus": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eus",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "eus_exams": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eus_exams",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "eus_proficiency": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eus_proficiency",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "eus_reading": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eus_reading",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "eus_trivia": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "eus_trivia",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "evalita_llm": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "evalita_llm",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "evalita_mp": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "evalita_mp",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "evalita_sp": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "evalita_sp",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "fda": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "fda",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "financial": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "financial",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "financial_tweets": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "financial_tweets",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "flan": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "flan",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "fld": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "fld",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "flores": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "flores",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "freebase": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "freebase",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "french_bench": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "french_bench",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "galcola": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "galcola"
930
- "benchmark_type": "other",
931
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
932
- },
933
- "galician_bench": {
934
- "evaluator": "log_likelihoods",
935
- "extractor_location": "lm_eval_pairs",
936
- "extractor_file": "galician_bench",
937
- "benchmark_type": "other",
938
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
939
- },
940
- "gaokao": {
941
- "evaluator": "log_likelihoods",
942
- "extractor_location": "lm_eval_pairs",
943
- "extractor_file": "gaokao",
944
- "benchmark_type": "other",
945
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
946
- },
947
- "glianorex": {
948
- "evaluator": "log_likelihoods",
949
- "extractor_location": "huggingface_pairs",
950
- "extractor_file": "glianorex",
951
- "benchmark_type": "other",
952
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
953
- },
954
- "global_mmlu": {
955
- "evaluator": "log_likelihoods",
956
- "extractor_location": "lm_eval_pairs",
957
- "extractor_file": "global_mmlu",
958
- "benchmark_type": "knowledge",
959
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
960
- },
961
- "global_piqa": {
962
- "evaluator": null,
963
- "extractor_location": "lm_eval_pairs",
964
- "extractor_file": "global_piqa",
965
- "benchmark_type": "question_answering",
966
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
967
- },
968
- "glue": {
969
- "evaluator": "log_likelihoods",
970
- "extractor_location": "lm_eval_pairs",
971
- "extractor_file": "glue",
972
- "benchmark_type": "other",
973
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
974
- },
975
- "gpqa": {
976
- "evaluator": "log_likelihoods",
977
- "extractor_location": "lm_eval_pairs",
978
- "extractor_file": "gpqa",
979
- "benchmark_type": "question_answering",
980
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
981
- },
982
- "gpt3": {
983
- "evaluator": "log_likelihoods",
984
- "extractor_location": "lm_eval_pairs",
985
- "extractor_file": "gpt3",
986
- "benchmark_type": "other",
987
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
988
- },
989
- "groundcocoa": {
990
- "evaluator": "generation",
991
- "extractor_location": "lm_eval_pairs",
992
- "extractor_file": "groundcocoa",
993
- "benchmark_type": "other",
994
- "explanation": "Text generation evaluation - assesses quality of generated text"
995
- },
996
- "gsm": {
997
- "evaluator": "exact_match",
998
- "extractor_location": "lm_eval_pairs",
999
- "extractor_file": "gsm",
1000
- "benchmark_type": "mathematics",
1001
- "explanation": "Text comparison (WARNING: should use execution for mathematics)"
1002
- },
1003
- "gsm8k": {
1004
- "evaluator": "exact_match",
1005
- "extractor_location": "lm_eval_pairs",
1006
- "extractor_file": "gsm8k",
1007
- "benchmark_type": "mathematics",
1008
- "explanation": "Text comparison (WARNING: should use execution for mathematics)"
1009
- },
1010
- "gsm8k_cot": {
1011
- "evaluator": null,
1012
- "extractor_location": "huggingface_pairs",
1013
- "extractor_file": "math",
1014
- "benchmark_type": "mathematics",
1015
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1016
- },
1017
- "gsm8k_cot_llama": {
1018
- "evaluator": null,
1019
- "extractor_location": "huggingface_pairs",
1020
- "extractor_file": "math",
1021
- "benchmark_type": "mathematics",
1022
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1023
- },
1024
- "gsm8k_cot_self_consistency": {
1025
- "evaluator": null,
1026
- "extractor_location": "huggingface_pairs",
1027
- "extractor_file": "math",
1028
- "benchmark_type": "mathematics",
1029
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1030
- },
1031
- "gsm8k_llama": {
1032
- "evaluator": null,
1033
- "extractor_location": "huggingface_pairs",
1034
- "extractor_file": "math",
1035
- "benchmark_type": "mathematics",
1036
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1037
- },
1038
- "gsm8k_platinum_cot": {
1039
- "evaluator": null,
1040
- "extractor_location": "huggingface_pairs",
1041
- "extractor_file": "math",
1042
- "benchmark_type": "mathematics",
1043
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1044
- },
1045
- "gsm8k_platinum_cot_llama": {
1046
- "evaluator": null,
1047
- "extractor_location": "huggingface_pairs",
1048
- "extractor_file": "math",
1049
- "benchmark_type": "mathematics",
1050
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1051
- },
1052
- "gsm8k_platinum_cot_self_consistency": {
1053
- "evaluator": null,
1054
- "extractor_location": "huggingface_pairs",
1055
- "extractor_file": "math",
1056
- "benchmark_type": "mathematics",
1057
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1058
- },
1059
- "gsm_plus": {
1060
- "evaluator": "exact_match",
1061
- "extractor_location": "huggingface_pairs",
1062
- "extractor_file": "gsm_plus",
1063
- "benchmark_type": "mathematics",
1064
- "explanation": "Text comparison (WARNING: should use execution for mathematics)"
1065
- },
1066
- "haerae": {
1067
- "evaluator": null,
1068
- "extractor_location": "lm_eval_pairs",
1069
- "extractor_file": "haerae",
1070
- "benchmark_type": "other",
1071
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1072
- },
1073
- "headqa": {
1074
- "evaluator": null,
1075
- "extractor_location": "lm_eval_pairs",
1076
- "extractor_file": "headqa",
1077
- "benchmark_type": "question_answering",
1078
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1079
- },
1080
- "hellaswag": {
1081
- "evaluator": "log_likelihoods",
1082
- "extractor_location": "lm_eval_pairs",
1083
- "extractor_file": "hellaswag",
1084
- "benchmark_type": "knowledge",
1085
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1086
- },
1087
- "hendrycks_ethics": {
1088
- "evaluator": null,
1089
- "extractor_location": "lm_eval_pairs",
1090
- "extractor_file": "hendrycks_ethics",
1091
- "benchmark_type": "other",
1092
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1093
- },
1094
- "hendrycks_math": {
1095
- "evaluator": null,
1096
- "extractor_location": "lm_eval_pairs",
1097
- "extractor_file": "hendrycks_math",
1098
- "benchmark_type": "mathematics",
1099
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1100
- },
1101
- "histoires_morales": {
1102
- "evaluator": "generation",
1103
- "extractor_location": "lm_eval_pairs",
1104
- "extractor_file": "histoires_morales",
1105
- "benchmark_type": "other",
1106
- "explanation": "Text generation evaluation - assesses quality of generated text"
1107
- },
1108
- "hle": {
1109
- "evaluator": null,
1110
- "extractor_location": "huggingface_pairs",
1111
- "extractor_file": "hle",
1112
- "benchmark_type": "other",
1113
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1114
- },
1115
- "hle_exact_match": {
1116
- "evaluator": null,
1117
- "extractor_location": "huggingface_pairs",
1118
- "extractor_file": "hle",
1119
- "benchmark_type": "other",
1120
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1121
- },
1122
- "hle_multiple_choice": {
1123
- "evaluator": null,
1124
- "extractor_location": "huggingface_pairs",
1125
- "extractor_file": "hle",
1126
- "benchmark_type": "other",
1127
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1128
- },
1129
- "hmmt": {
1130
- "evaluator": null,
1131
- "extractor_location": "huggingface_pairs",
1132
- "extractor_file": "hmmt",
1133
- "benchmark_type": "mathematics",
1134
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1135
- },
1136
- "hmmt_feb_2025": {
1137
- "evaluator": null,
1138
- "extractor_location": "huggingface_pairs",
1139
- "extractor_file": "hmmt",
1140
- "benchmark_type": "mathematics",
1141
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1142
- },
1143
- "hrm8k": {
1144
- "evaluator": "exact_match",
1145
- "extractor_location": "lm_eval_pairs",
1146
- "extractor_file": "hrm8k",
1147
- "benchmark_type": "other",
1148
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
1149
- },
1150
- "humaneval": {
1151
- "evaluator": null,
1152
- "extractor_location": "huggingface_pairs",
1153
- "extractor_file": "humaneval",
1154
- "benchmark_type": "coding",
1155
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1156
- },
1157
- "humaneval_64_instruct": {
1158
- "evaluator": null,
1159
- "extractor_location": "huggingface_pairs",
1160
- "extractor_file": "instructhumaneval",
1161
- "benchmark_type": "coding",
1162
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1163
- },
1164
- "humaneval_infilling": {
1165
- "evaluator": null,
1166
- "extractor_location": "lm_eval_pairs",
1167
- "extractor_file": "humaneval_infilling",
1168
- "benchmark_type": "coding",
1169
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1170
- },
1171
- "humaneval_instruct": {
1172
- "evaluator": null,
1173
- "extractor_location": "huggingface_pairs",
1174
- "extractor_file": "instructhumaneval",
1175
- "benchmark_type": "coding",
1176
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1177
- },
1178
- "humaneval_plus": {
1179
- "evaluator": null,
1180
- "extractor_location": "huggingface_pairs",
1181
- "extractor_file": "humaneval",
1182
- "benchmark_type": "coding",
1183
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1184
- },
1185
- "humanevalpack": {
1186
- "evaluator": "exact_match",
1187
- "extractor_location": "huggingface_pairs",
1188
- "extractor_file": "humanevalpack",
1189
- "benchmark_type": "coding",
1190
- "explanation": "Text comparison (WARNING: should use execution for coding)"
1191
- },
1192
- "icelandic_winogrande": {
1193
- "evaluator": null,
1194
- "extractor_location": "lm_eval_pairs",
1195
- "extractor_file": "icelandic_winogrande",
1196
- "benchmark_type": "other",
1197
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1198
- },
1199
- "ifeval": {
1200
- "evaluator": "exact_match",
1201
- "extractor_location": "lm_eval_pairs",
1202
- "extractor_file": "ifeval",
1203
- "benchmark_type": "other",
1204
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
1205
- },
1206
- "instruct_humaneval": {
1207
- "evaluator": null,
1208
- "extractor_location": "huggingface_pairs",
1209
- "extractor_file": "instructhumaneval",
1210
- "benchmark_type": "coding",
1211
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1212
- },
1213
- "instructhumaneval": {
1214
- "evaluator": null,
1215
- "extractor_location": "huggingface_pairs",
1216
- "extractor_file": "instructhumaneval",
1217
- "benchmark_type": "coding",
1218
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1219
- },
1220
- "inverse": {
1221
- "evaluator": null,
1222
- "extractor_location": "lm_eval_pairs",
1223
- "extractor_file": "inverse",
1224
- "benchmark_type": "other",
1225
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1226
- },
1227
- "inverse_scaling": {
1228
- "evaluator": "log_likelihoods",
1229
- "extractor_location": "lm_eval_pairs",
1230
- "extractor_file": "inverse_scaling",
1231
- "benchmark_type": "other",
1232
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1233
- },
1234
- "iwslt2017": {
1235
- "evaluator": "generation",
1236
- "extractor_location": "lm_eval_pairs",
1237
- "extractor_file": "iwslt2017",
1238
- "benchmark_type": "translation",
1239
- "explanation": "Text generation evaluation - assesses quality of generated text"
1240
- },
1241
- "iwslt2017_ar_en": {
1242
- "evaluator": "generation",
1243
- "extractor_location": "huggingface_pairs",
1244
- "extractor_file": "iwslt2017_ar_en",
1245
- "benchmark_type": "translation",
1246
- "explanation": "Text generation evaluation - assesses quality of generated text"
1247
- },
1248
- "iwslt2017_en_ar": {
1249
- "evaluator": "generation",
1250
- "extractor_location": "huggingface_pairs",
1251
- "extractor_file": "iwslt2017_en_ar",
1252
- "benchmark_type": "translation",
1253
- "explanation": "Text generation evaluation - assesses quality of generated text"
1254
- },
1255
- "ja": {
1256
- "evaluator": null,
1257
- "extractor_location": "lm_eval_pairs",
1258
- "extractor_file": "ja",
1259
- "benchmark_type": "other",
1260
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1261
- },
1262
- "japanese_leaderboard": {
1263
- "evaluator": "log_likelihoods",
1264
- "extractor_location": "lm_eval_pairs",
1265
- "extractor_file": "japanese_leaderboard",
1266
- "benchmark_type": "other",
1267
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1268
- },
1269
- "jsonschema_bench": {
1270
- "evaluator": "generation",
1271
- "extractor_location": "lm_eval_pairs",
1272
- "extractor_file": "jsonschema_bench",
1273
- "benchmark_type": "other",
1274
- "explanation": "Text generation evaluation - assesses quality of generated text"
1275
- },
1276
- "kbl": {
1277
- "evaluator": "log_likelihoods",
1278
- "extractor_location": "lm_eval_pairs",
1279
- "extractor_file": "kbl",
1280
- "benchmark_type": "other",
1281
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1282
- },
1283
- "kmmlu": {
1284
- "evaluator": "log_likelihoods",
1285
- "extractor_location": "lm_eval_pairs",
1286
- "extractor_file": "kmmlu",
1287
- "benchmark_type": "knowledge",
1288
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1289
- },
1290
- "kobest": {
1291
- "evaluator": null,
1292
- "extractor_location": "lm_eval_pairs",
1293
- "extractor_file": "kobest",
1294
- "benchmark_type": "other",
1295
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1296
- },
1297
- "kormedmcqa": {
1298
- "evaluator": "generation",
1299
- "extractor_location": "lm_eval_pairs",
1300
- "extractor_file": "kormedmcqa",
1301
- "benchmark_type": "question_answering",
1302
- "explanation": "Text generation evaluation - assesses quality of generated text"
1303
- },
1304
- "lambada": {
1305
- "evaluator": "exact_match",
1306
- "extractor_location": "lm_eval_pairs",
1307
- "extractor_file": "lambada",
1308
- "benchmark_type": "other",
1309
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
1310
- },
1311
- "lambada_cloze": {
1312
- "evaluator": null,
1313
- "extractor_location": "lm_eval_pairs",
1314
- "extractor_file": "lambada_cloze",
1315
- "benchmark_type": "other",
1316
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1317
- },
1318
- "lambada_multilingual": {
1319
- "evaluator": null,
1320
- "extractor_location": "lm_eval_pairs",
1321
- "extractor_file": "lambada_multilingual",
1322
- "benchmark_type": "other",
1323
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1324
- },
1325
- "lambada_multilingual_stablelm": {
1326
- "evaluator": "log_likelihoods",
1327
- "extractor_location": "lm_eval_pairs",
1328
- "extractor_file": "lambada_multilingual_stablelm",
1329
- "benchmark_type": "other",
1330
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1331
- },
1332
- "law": {
1333
- "evaluator": null,
1334
- "extractor_location": "lm_eval_pairs",
1335
- "extractor_file": "law",
1336
- "benchmark_type": "other",
1337
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1338
- },
1339
- "law_stack_exchange": {
1340
- "evaluator": "generation",
1341
- "extractor_location": "huggingface_pairs",
1342
- "extractor_file": "law_stack_exchange",
1343
- "benchmark_type": "other",
1344
- "explanation": "Text generation evaluation - assesses quality of generated text"
1345
- },
1346
- "leaderboard": {
1347
- "evaluator": null,
1348
- "extractor_location": "lm_eval_pairs",
1349
- "extractor_file": "leaderboard",
1350
- "benchmark_type": "other",
1351
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1352
- },
1353
- "ledgar": {
1354
- "evaluator": null,
1355
- "extractor_location": "huggingface_pairs",
1356
- "extractor_file": "ledgar",
1357
- "benchmark_type": "other",
1358
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1359
- },
1360
- "libra": {
1361
- "evaluator": "generation",
1362
- "extractor_location": "lm_eval_pairs",
1363
- "extractor_file": "libra",
1364
- "benchmark_type": "other",
1365
- "explanation": "Text generation evaluation - assesses quality of generated text"
1366
- },
1367
- "lingoly": {
1368
- "evaluator": "log_likelihoods",
1369
- "extractor_location": "lm_eval_pairs",
1370
- "extractor_file": "lingoly",
1371
- "benchmark_type": "other",
1372
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1373
- },
1374
- "livecodebench": {
1375
- "evaluator": null,
1376
- "extractor_location": "huggingface_pairs",
1377
- "extractor_file": "livecodebench",
1378
- "benchmark_type": "coding",
1379
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1380
- },
1381
- "livemathbench": {
1382
- "evaluator": null,
1383
- "extractor_location": "huggingface_pairs",
1384
- "extractor_file": "livemathbench",
1385
- "benchmark_type": "mathematics",
1386
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1387
- },
1388
- "livemathbench_cnmo_en": {
1389
- "evaluator": null,
1390
- "extractor_location": "huggingface_pairs",
1391
- "extractor_file": "livemathbench_configs",
1392
- "benchmark_type": "mathematics",
1393
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1394
- },
1395
- "livemathbench_cnmo_zh": {
1396
- "evaluator": null,
1397
- "extractor_location": "huggingface_pairs",
1398
- "extractor_file": "livemathbench_configs",
1399
- "benchmark_type": "mathematics",
1400
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1401
- },
1402
- "llama": {
1403
- "evaluator": null,
1404
- "extractor_location": "huggingface_pairs",
1405
- "extractor_file": "llama",
1406
- "benchmark_type": "other",
1407
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1408
- },
1409
- "llama3": {
1410
- "evaluator": null,
1411
- "extractor_location": "lm_eval_pairs",
1412
- "extractor_file": "llama3",
1413
- "benchmark_type": "other",
1414
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1415
- },
1416
- "lm_syneval": {
1417
- "evaluator": null,
1418
- "extractor_location": "lm_eval_pairs",
1419
- "extractor_file": "lm_syneval",
1420
- "benchmark_type": "other",
1421
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1422
- },
1423
- "logieval": {
1424
- "evaluator": null,
1425
- "extractor_location": "huggingface_pairs",
1426
- "extractor_file": "logieval",
1427
- "benchmark_type": "other",
1428
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1429
- },
1430
- "logiqa": {
1431
- "evaluator": null,
1432
- "extractor_location": "lm_eval_pairs",
1433
- "extractor_file": "logiqa",
1434
- "benchmark_type": "question_answering",
1435
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1436
- },
1437
- "logiqa2": {
1438
- "evaluator": null,
1439
- "extractor_location": "lm_eval_pairs",
1440
- "extractor_file": "logiqa2",
1441
- "benchmark_type": "question_answering",
1442
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1443
- },
1444
- "longbench": {
1445
- "evaluator": null,
1446
- "extractor_location": "lm_eval_pairs",
1447
- "extractor_file": "longbench",
1448
- "benchmark_type": "other",
1449
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1450
- },
1451
- "longbenchv2": {
1452
- "evaluator": null,
1453
- "extractor_location": "lm_eval_pairs",
1454
- "extractor_file": "longbenchv2",
1455
- "benchmark_type": "other",
1456
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1457
- },
1458
- "m_mmlu": {
1459
- "evaluator": null,
1460
- "extractor_location": "huggingface_pairs",
1461
- "extractor_file": "m_mmlu",
1462
- "benchmark_type": "knowledge",
1463
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1464
- },
1465
- "mastermind": {
1466
- "evaluator": "log_likelihoods",
1467
- "extractor_location": "lm_eval_pairs",
1468
- "extractor_file": "mastermind",
1469
- "benchmark_type": "other",
1470
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1471
- },
1472
- "math": {
1473
- "evaluator": null,
1474
- "extractor_location": "huggingface_pairs",
1475
- "extractor_file": "math",
1476
- "benchmark_type": "mathematics",
1477
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1478
- },
1479
- "math500": {
1480
- "evaluator": null,
1481
- "extractor_location": "huggingface_pairs",
1482
- "extractor_file": "math",
1483
- "benchmark_type": "mathematics",
1484
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1485
- },
1486
- "math_500": {
1487
- "evaluator": null,
1488
- "extractor_location": "huggingface_pairs",
1489
- "extractor_file": "math",
1490
- "benchmark_type": "mathematics",
1491
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1492
- },
1493
- "mathqa": {
1494
- "evaluator": null,
1495
- "extractor_location": "lm_eval_pairs",
1496
- "extractor_file": "mathqa",
1497
- "benchmark_type": "mathematics",
1498
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1499
- },
1500
- "mbpp": {
1501
- "evaluator": null,
1502
- "extractor_location": "huggingface_pairs",
1503
- "extractor_file": "mbpp",
1504
- "benchmark_type": "coding",
1505
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1506
- },
1507
- "mbpp_plus": {
1508
- "evaluator": null,
1509
- "extractor_location": "huggingface_pairs",
1510
- "extractor_file": "mbpp",
1511
- "benchmark_type": "coding",
1512
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1513
- },
1514
- "mc-taco": {
1515
- "evaluator": null,
1516
- "extractor_location": "lm_eval_pairs",
1517
- "extractor_file": "mc-taco",
1518
- "benchmark_type": "other",
1519
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1520
- },
1521
- "med_concepts_qa": {
1522
- "evaluator": "log_likelihoods",
1523
- "extractor_location": "lm_eval_pairs",
1524
- "extractor_file": "med_concepts_qa",
1525
- "benchmark_type": "question_answering",
1526
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1527
- },
1528
- "meddialog": {
1529
- "evaluator": null,
1530
- "extractor_location": "huggingface_pairs",
1531
- "extractor_file": "meddialog",
1532
- "benchmark_type": "other",
1533
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1534
- },
1535
- "meddialog_qsumm": {
1536
- "evaluator": null,
1537
- "extractor_location": "huggingface_pairs",
1538
- "extractor_file": "meddialog",
1539
- "benchmark_type": "other",
1540
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1541
- },
1542
- "meddialog_qsumm_perplexity": {
1543
- "evaluator": null,
1544
- "extractor_location": "huggingface_pairs",
1545
- "extractor_file": "meddialog",
1546
- "benchmark_type": "other",
1547
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1548
- },
1549
- "meddialog_raw_dialogues": {
1550
- "evaluator": null,
1551
- "extractor_location": "huggingface_pairs",
1552
- "extractor_file": "meddialog",
1553
- "benchmark_type": "other",
1554
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1555
- },
1556
- "meddialog_raw_perplexity": {
1557
- "evaluator": null,
1558
- "extractor_location": "huggingface_pairs",
1559
- "extractor_file": "meddialog",
1560
- "benchmark_type": "other",
1561
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1562
- },
1563
- "medical": {
1564
- "evaluator": null,
1565
- "extractor_location": "lm_eval_pairs",
1566
- "extractor_file": "medical",
1567
- "benchmark_type": "other",
1568
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1569
- },
1570
- "medical_abstracts": {
1571
- "evaluator": "generation",
1572
- "extractor_location": "huggingface_pairs",
1573
- "extractor_file": "medical_abstracts",
1574
- "benchmark_type": "other",
1575
- "explanation": "Text generation evaluation - assesses quality of generated text"
1576
- },
1577
- "mediqa_qa2019": {
1578
- "evaluator": "generation",
1579
- "extractor_location": "lm_eval_pairs",
1580
- "extractor_file": "mediqa_qa2019",
1581
- "benchmark_type": "question_answering",
1582
- "explanation": "Text generation evaluation - assesses quality of generated text"
1583
- },
1584
- "medmcqa": {
1585
- "evaluator": "log_likelihoods",
1586
- "extractor_location": "lm_eval_pairs",
1587
- "extractor_file": "medmcqa",
1588
- "benchmark_type": "question_answering",
1589
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1590
- },
1591
- "medqa": {
1592
- "evaluator": null,
1593
- "extractor_location": "lm_eval_pairs",
1594
- "extractor_file": "medqa",
1595
- "benchmark_type": "question_answering",
1596
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1597
- },
1598
- "medtext": {
1599
- "evaluator": "generation",
1600
- "extractor_location": "lm_eval_pairs",
1601
- "extractor_file": "medtext",
1602
- "benchmark_type": "other",
1603
- "explanation": "Text generation evaluation - assesses quality of generated text"
1604
- },
1605
- "mela": {
1606
- "evaluator": "log_likelihoods",
1607
- "extractor_location": "huggingface_pairs",
1608
- "extractor_file": "mela",
1609
- "benchmark_type": "other",
1610
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1611
- },
1612
- "meqsum": {
1613
- "evaluator": "generation",
1614
- "extractor_location": "lm_eval_pairs",
1615
- "extractor_file": "meqsum",
1616
- "benchmark_type": "other",
1617
- "explanation": "Text generation evaluation - assesses quality of generated text"
1618
- },
1619
- "mercury": {
1620
- "evaluator": null,
1621
- "extractor_location": "huggingface_pairs",
1622
- "extractor_file": "mercury",
1623
- "benchmark_type": "other",
1624
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1625
- },
1626
- "metabench": {
1627
- "evaluator": null,
1628
- "extractor_location": "lm_eval_pairs",
1629
- "extractor_file": "metabench",
1630
- "benchmark_type": "other",
1631
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1632
- },
1633
- "mgsm": {
1634
- "evaluator": "generation",
1635
- "extractor_location": "lm_eval_pairs",
1636
- "extractor_file": "mgsm",
1637
- "benchmark_type": "mathematics",
1638
- "explanation": "Text generation evaluation - assesses quality of generated text"
1639
- },
1640
- "mimic_repsum": {
1641
- "evaluator": "generation",
1642
- "extractor_location": "lm_eval_pairs",
1643
- "extractor_file": "mimic_repsum",
1644
- "benchmark_type": "other",
1645
- "explanation": "Text generation evaluation - assesses quality of generated text"
1646
- },
1647
- "minerva_math": {
1648
- "evaluator": "generation",
1649
- "extractor_location": "lm_eval_pairs",
1650
- "extractor_file": "minerva_math",
1651
- "benchmark_type": "mathematics",
1652
- "explanation": "Text generation evaluation - assesses quality of generated text"
1653
- },
1654
- "mlqa": {
1655
- "evaluator": "generation",
1656
- "extractor_location": "lm_eval_pairs",
1657
- "extractor_file": "mlqa",
1658
- "benchmark_type": "question_answering",
1659
- "explanation": "Text generation evaluation - assesses quality of generated text"
1660
- },
1661
- "mmlu": {
1662
- "evaluator": "log_likelihoods",
1663
- "extractor_location": "lm_eval_pairs",
1664
- "extractor_file": "mmlu",
1665
- "benchmark_type": "knowledge",
1666
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1667
- },
1668
- "mmlu_pro": {
1669
- "evaluator": null,
1670
- "extractor_location": "lm_eval_pairs",
1671
- "extractor_file": "mmlu_pro",
1672
- "benchmark_type": "knowledge",
1673
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1674
- },
1675
- "mmlusr": {
1676
- "evaluator": null,
1677
- "extractor_location": "huggingface_pairs",
1678
- "extractor_file": "mmlusr",
1679
- "benchmark_type": "knowledge",
1680
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1681
- },
1682
- "mmlusr_answer_only": {
1683
- "evaluator": null,
1684
- "extractor_location": "huggingface_pairs",
1685
- "extractor_file": "mmlusr",
1686
- "benchmark_type": "knowledge",
1687
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1688
- },
1689
- "mmlusr_question_and_answer": {
1690
- "evaluator": null,
1691
- "extractor_location": "huggingface_pairs",
1692
- "extractor_file": "mmlusr",
1693
- "benchmark_type": "knowledge",
1694
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1695
- },
1696
- "mmlusr_question_only": {
1697
- "evaluator": null,
1698
- "extractor_location": "huggingface_pairs",
1699
- "extractor_file": "mmlusr",
1700
- "benchmark_type": "knowledge",
1701
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1702
- },
1703
- "mmmu": {
1704
- "evaluator": "log_likelihoods",
1705
- "extractor_location": "lm_eval_pairs",
1706
- "extractor_file": "mmmu",
1707
- "benchmark_type": "other",
1708
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1709
- },
1710
- "mnli": {
1711
- "evaluator": "log_likelihoods",
1712
- "extractor_location": "lm_eval_pairs",
1713
- "extractor_file": "mnli",
1714
- "benchmark_type": "other",
1715
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1716
- },
1717
- "model_written_evals": {
1718
- "evaluator": "log_likelihoods",
1719
- "extractor_location": "lm_eval_pairs",
1720
- "extractor_file": "model_written_evals",
1721
- "benchmark_type": "other",
1722
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1723
- },
1724
- "moral_stories": {
1725
- "evaluator": "log_likelihoods",
1726
- "extractor_location": "lm_eval_pairs",
1727
- "extractor_file": "moral_stories",
1728
- "benchmark_type": "other",
1729
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1730
- },
1731
- "mrpc": {
1732
- "evaluator": null,
1733
- "extractor_location": "lm_eval_pairs",
1734
- "extractor_file": "mrpc",
1735
- "benchmark_type": "other",
1736
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1737
- },
1738
- "mts_dialog": {
1739
- "evaluator": "generation",
1740
- "extractor_location": "lm_eval_pairs",
1741
- "extractor_file": "mts_dialog",
1742
- "benchmark_type": "other",
1743
- "explanation": "Text generation evaluation - assesses quality of generated text"
1744
- },
1745
- "multiblimp": {
1746
- "evaluator": "log_likelihoods",
1747
- "extractor_location": "lm_eval_pairs",
1748
- "extractor_file": "multiblimp",
1749
- "benchmark_type": "other",
1750
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1751
- },
1752
- "multilingual": {
1753
- "evaluator": null,
1754
- "extractor_location": "lm_eval_pairs",
1755
- "extractor_file": "multilingual",
1756
- "benchmark_type": "other",
1757
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1758
- },
1759
- "multimedqa": {
1760
- "evaluator": "log_likelihoods",
1761
- "extractor_location": "huggingface_pairs",
1762
- "extractor_file": "multimedqa",
1763
- "benchmark_type": "question_answering",
1764
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1765
- },
1766
- "multipl_e": {
1767
- "evaluator": null,
1768
- "extractor_location": "huggingface_pairs",
1769
- "extractor_file": "multipl_e",
1770
- "benchmark_type": "other",
1771
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1772
- },
1773
- "multiple": {
1774
- "evaluator": null,
1775
- "extractor_location": "huggingface_pairs",
1776
- "extractor_file": "multiple",
1777
- "benchmark_type": "other",
1778
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1779
- },
1780
- "multiple_cpp": {
1781
- "evaluator": null,
1782
- "extractor_location": "huggingface_pairs",
1783
- "extractor_file": "multipl_e",
1784
- "benchmark_type": "other",
1785
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1786
- },
1787
- "multiple_go": {
1788
- "evaluator": null,
1789
- "extractor_location": "huggingface_pairs",
1790
- "extractor_file": "multipl_e",
1791
- "benchmark_type": "other",
1792
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1793
- },
1794
- "multiple_java": {
1795
- "evaluator": null,
1796
- "extractor_location": "huggingface_pairs",
1797
- "extractor_file": "multipl_e",
1798
- "benchmark_type": "other",
1799
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1800
- },
1801
- "multiple_js": {
1802
- "evaluator": null,
1803
- "extractor_location": "huggingface_pairs",
1804
- "extractor_file": "multipl_e",
1805
- "benchmark_type": "other",
1806
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1807
- },
1808
- "multiple_py": {
1809
- "evaluator": null,
1810
- "extractor_location": "huggingface_pairs",
1811
- "extractor_file": "multipl_e",
1812
- "benchmark_type": "other",
1813
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1814
- },
1815
- "multiple_rs": {
1816
- "evaluator": null,
1817
- "extractor_location": "huggingface_pairs",
1818
- "extractor_file": "multipl_e",
1819
- "benchmark_type": "other",
1820
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1821
- },
1822
- "multirc": {
1823
- "evaluator": null,
1824
- "extractor_location": "lm_eval_pairs",
1825
- "extractor_file": "multirc",
1826
- "benchmark_type": "other",
1827
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1828
- },
1829
- "mutual": {
1830
- "evaluator": null,
1831
- "extractor_location": "lm_eval_pairs",
1832
- "extractor_file": "mutual",
1833
- "benchmark_type": "other",
1834
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1835
- },
1836
- "non": {
1837
- "evaluator": null,
1838
- "extractor_location": "lm_eval_pairs",
1839
- "extractor_file": "non",
1840
- "benchmark_type": "other",
1841
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1842
- },
1843
- "noreval": {
1844
- "evaluator": "log_likelihoods",
1845
- "extractor_location": "lm_eval_pairs",
1846
- "extractor_file": "noreval",
1847
- "benchmark_type": "other",
1848
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1849
- },
1850
- "noreval_gen": {
1851
- "evaluator": "generation",
1852
- "extractor_location": "lm_eval_pairs",
1853
- "extractor_file": "noreval_gen",
1854
- "benchmark_type": "other",
1855
- "explanation": "Text generation evaluation - assesses quality of generated text"
1856
- },
1857
- "noreval_mc": {
1858
- "evaluator": "log_likelihoods",
1859
- "extractor_location": "lm_eval_pairs",
1860
- "extractor_file": "noreval_mc",
1861
- "benchmark_type": "other",
1862
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1863
- },
1864
- "noticia": {
1865
- "evaluator": "generation",
1866
- "extractor_location": "huggingface_pairs",
1867
- "extractor_file": "noticia",
1868
- "benchmark_type": "other",
1869
- "explanation": "Text generation evaluation - assesses quality of generated text"
1870
- },
1871
- "nq_open": {
1872
- "evaluator": "generation",
1873
- "extractor_location": "lm_eval_pairs",
1874
- "extractor_file": "nq_open",
1875
- "benchmark_type": "other",
1876
- "explanation": "Text generation evaluation - assesses quality of generated text"
1877
- },
1878
- "okapi": {
1879
- "evaluator": "log_likelihoods",
1880
- "extractor_location": "lm_eval_pairs",
1881
- "extractor_file": "okapi",
1882
- "benchmark_type": "other",
1883
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1884
- },
1885
- "okapi_arc_multilingual": {
1886
- "evaluator": "log_likelihoods",
1887
- "extractor_location": "lm_eval_pairs",
1888
- "extractor_file": "okapi_arc_multilingual",
1889
- "benchmark_type": "knowledge",
1890
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1891
- },
1892
- "okapi_hellaswag_multilingual": {
1893
- "evaluator": "log_likelihoods",
1894
- "extractor_location": "lm_eval_pairs",
1895
- "extractor_file": "okapi_hellaswag_multilingual",
1896
- "benchmark_type": "knowledge",
1897
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1898
- },
1899
- "okapi_mmlu_multilingual": {
1900
- "evaluator": "log_likelihoods",
1901
- "extractor_location": "lm_eval_pairs",
1902
- "extractor_file": "okapi_mmlu_multilingual",
1903
- "benchmark_type": "knowledge",
1904
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1905
- },
1906
- "okapi_truthfulqa_multilingual": {
1907
- "evaluator": "log_likelihoods",
1908
- "extractor_location": "lm_eval_pairs",
1909
- "extractor_file": "okapi_truthfulqa_multilingual",
1910
- "benchmark_type": "question_answering",
1911
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1912
- },
1913
- "olaph": {
1914
- "evaluator": "generation",
1915
- "extractor_location": "lm_eval_pairs",
1916
- "extractor_file": "olaph",
1917
- "benchmark_type": "other",
1918
- "explanation": "Text generation evaluation - assesses quality of generated text"
1919
- },
1920
- "openbookqa": {
1921
- "evaluator": "log_likelihoods",
1922
- "extractor_location": "lm_eval_pairs",
1923
- "extractor_file": "openbookqa",
1924
- "benchmark_type": "question_answering",
1925
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1926
- },
1927
- "openllm": {
1928
- "evaluator": "log_likelihoods",
1929
- "extractor_location": "huggingface_pairs",
1930
- "extractor_file": "openllm",
1931
- "benchmark_type": "other",
1932
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1933
- },
1934
- "option": {
1935
- "evaluator": null,
1936
- "extractor_location": "lm_eval_pairs",
1937
- "extractor_file": "option",
1938
- "benchmark_type": "other",
1939
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1940
- },
1941
- "paloma": {
1942
- "evaluator": "perplexity",
1943
- "extractor_location": "lm_eval_pairs",
1944
- "extractor_file": "paloma",
1945
- "benchmark_type": "other",
1946
- "explanation": "Perplexity measurement - evaluates model's prediction confidence"
1947
- },
1948
- "parafraseja": {
1949
- "evaluator": "log_likelihoods",
1950
- "extractor_location": "lm_eval_pairs",
1951
- "extractor_file": "parafraseja",
1952
- "benchmark_type": "other",
1953
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1954
- },
1955
- "parafrases": {
1956
- "evaluator": "log_likelihoods",
1957
- "extractor_location": "lm_eval_pairs",
1958
- "extractor_file": "parafrases",
1959
- "benchmark_type": "other",
1960
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1961
- },
1962
- "paws": {
1963
- "evaluator": null,
1964
- "extractor_location": "lm_eval_pairs",
1965
- "extractor_file": "paws",
1966
- "benchmark_type": "other",
1967
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1968
- },
1969
- "paws_x": {
1970
- "evaluator": "log_likelihoods",
1971
- "extractor_location": "lm_eval_pairs",
1972
- "extractor_file": "paws_x",
1973
- "benchmark_type": "other",
1974
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1975
- },
1976
- "pawsx": {
1977
- "evaluator": "log_likelihoods",
1978
- "extractor_location": "lm_eval_pairs",
1979
- "extractor_file": "pawsx",
1980
- "benchmark_type": "other",
1981
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1982
- },
1983
- "persona": {
1984
- "evaluator": "log_likelihoods",
1985
- "extractor_location": "lm_eval_pairs",
1986
- "extractor_file": "persona",
1987
- "benchmark_type": "other",
1988
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
1989
- },
1990
- "phrases": {
1991
- "evaluator": null,
1992
- "extractor_location": "lm_eval_pairs",
1993
- "extractor_file": "phrases",
1994
- "benchmark_type": "other",
1995
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
1996
- },
1997
- "pile": {
1998
- "evaluator": "exact_match",
1999
- "extractor_location": "lm_eval_pairs",
2000
- "extractor_file": "pile",
2001
- "benchmark_type": "other",
2002
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2003
- },
2004
- "pile_10k": {
2005
- "evaluator": "generation",
2006
- "extractor_location": "lm_eval_pairs",
2007
- "extractor_file": "pile_10k",
2008
- "benchmark_type": "other",
2009
- "explanation": "Text generation evaluation - assesses quality of generated text"
2010
- },
2011
- "piqa": {
2012
- "evaluator": "log_likelihoods",
2013
- "extractor_location": "lm_eval_pairs",
2014
- "extractor_file": "piqa",
2015
- "benchmark_type": "question_answering",
2016
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2017
- },
2018
- "polemo2": {
2019
- "evaluator": "generation",
2020
- "extractor_location": "lm_eval_pairs",
2021
- "extractor_file": "polemo2",
2022
- "benchmark_type": "other",
2023
- "explanation": "Text generation evaluation - assesses quality of generated text"
2024
- },
2025
- "polymath": {
2026
- "evaluator": null,
2027
- "extractor_location": "huggingface_pairs",
2028
- "extractor_file": "polymath",
2029
- "benchmark_type": "mathematics",
2030
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2031
- },
2032
- "polymath_en_high": {
2033
- "evaluator": null,
2034
- "extractor_location": "huggingface_pairs",
2035
- "extractor_file": "polymath_configs",
2036
- "benchmark_type": "mathematics",
2037
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2038
- },
2039
- "polymath_en_medium": {
2040
- "evaluator": null,
2041
- "extractor_location": "huggingface_pairs",
2042
- "extractor_file": "polymath_configs",
2043
- "benchmark_type": "mathematics",
2044
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2045
- },
2046
- "polymath_zh_high": {
2047
- "evaluator": null,
2048
- "extractor_location": "huggingface_pairs",
2049
- "extractor_file": "polymath_configs",
2050
- "benchmark_type": "mathematics",
2051
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2052
- },
2053
- "polymath_zh_medium": {
2054
- "evaluator": null,
2055
- "extractor_location": "huggingface_pairs",
2056
- "extractor_file": "polymath_configs",
2057
- "benchmark_type": "mathematics",
2058
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2059
- },
2060
- "portuguese_bench": {
2061
- "evaluator": "log_likelihoods",
2062
- "extractor_location": "lm_eval_pairs",
2063
- "extractor_file": "portuguese_bench",
2064
- "benchmark_type": "other",
2065
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2066
- },
2067
- "prompt": {
2068
- "evaluator": null,
2069
- "extractor_location": "lm_eval_pairs",
2070
- "extractor_file": "prompt",
2071
- "benchmark_type": "other",
2072
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2073
- },
2074
- "prost": {
2075
- "evaluator": null,
2076
- "extractor_location": "lm_eval_pairs",
2077
- "extractor_file": "prost",
2078
- "benchmark_type": "other",
2079
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2080
- },
2081
- "pubmedqa": {
2082
- "evaluator": null,
2083
- "extractor_location": "lm_eval_pairs",
2084
- "extractor_file": "pubmedqa",
2085
- "benchmark_type": "question_answering",
2086
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2087
- },
2088
- "pythia": {
2089
- "evaluator": "log_likelihoods",
2090
- "extractor_location": "huggingface_pairs",
2091
- "extractor_file": "pythia",
2092
- "benchmark_type": "other",
2093
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2094
- },
2095
- "qa4mre": {
2096
- "evaluator": "log_likelihoods",
2097
- "extractor_location": "lm_eval_pairs",
2098
- "extractor_file": "qa4mre",
2099
- "benchmark_type": "question_answering",
2100
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2101
- },
2102
- "qasper": {
2103
- "evaluator": "generation",
2104
- "extractor_location": "lm_eval_pairs",
2105
- "extractor_file": "qasper",
2106
- "benchmark_type": "question_answering",
2107
- "explanation": "Text generation evaluation - assesses quality of generated text"
2108
- },
2109
- "qnli": {
2110
- "evaluator": null,
2111
- "extractor_location": "lm_eval_pairs",
2112
- "extractor_file": "qnli",
2113
- "benchmark_type": "other",
2114
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2115
- },
2116
- "qnlieu": {
2117
- "evaluator": null,
2118
- "extractor_location": "lm_eval_pairs",
2119
- "extractor_file": "qnlieu",
2120
- "benchmark_type": "other",
2121
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2122
- },
2123
- "qqp": {
2124
- "evaluator": null,
2125
- "extractor_location": "lm_eval_pairs",
2126
- "extractor_file": "qqp",
2127
- "benchmark_type": "other",
2128
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2129
- },
2130
- "quac": {
2131
- "evaluator": null,
2132
- "extractor_location": "lm_eval_pairs",
2133
- "extractor_file": "quac",
2134
- "benchmark_type": "other",
2135
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2136
- },
2137
- "race": {
2138
- "evaluator": "log_likelihoods",
2139
- "extractor_location": "lm_eval_pairs",
2140
- "extractor_file": "race",
2141
- "benchmark_type": "other",
2142
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2143
- },
2144
- "random": {
2145
- "evaluator": null,
2146
- "extractor_location": "lm_eval_pairs",
2147
- "extractor_file": "random",
2148
- "benchmark_type": "other",
2149
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2150
- },
2151
- "realtoxicityprompts": {
2152
- "evaluator": "generation",
2153
- "extractor_location": "lm_eval_pairs",
2154
- "extractor_file": "realtoxicityprompts",
2155
- "benchmark_type": "other",
2156
- "explanation": "Text generation evaluation - assesses quality of generated text"
2157
- },
2158
- "recode": {
2159
- "evaluator": null,
2160
- "extractor_location": "huggingface_pairs",
2161
- "extractor_file": "recode",
2162
- "benchmark_type": "coding",
2163
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2164
- },
2165
- "record": {
2166
- "evaluator": null,
2167
- "extractor_location": "huggingface_pairs",
2168
- "extractor_file": "record",
2169
- "benchmark_type": "other",
2170
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2171
- },
2172
- "reversed": {
2173
- "evaluator": "exact_match",
2174
- "extractor_location": "lm_eval_pairs",
2175
- "extractor_file": "reversed",
2176
- "benchmark_type": "other",
2177
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
2178
- },
2179
- "rte": {
2180
- "evaluator": null,
2181
- "extractor_location": "lm_eval_pairs",
2182
- "extractor_file": "rte",
2183
- "benchmark_type": "other",
2184
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2185
- },
2186
- "ruler": {
2187
- "evaluator": null,
2188
- "extractor_location": "lm_eval_pairs",
2189
- "extractor_file": "ruler",
2190
- "benchmark_type": "other",
2191
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2192
- },
2193
- "sciq": {
2194
- "evaluator": "log_likelihoods",
2195
- "extractor_location": "lm_eval_pairs",
2196
- "extractor_file": "sciq",
2197
- "benchmark_type": "other",
2198
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2199
- },
2200
- "score": {
2201
- "evaluator": "log_likelihoods",
2202
- "extractor_location": "lm_eval_pairs",
2203
- "extractor_file": "score",
2204
- "benchmark_type": "other",
2205
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2206
- },
2207
- "scrolls": {
2208
- "evaluator": "generation",
2209
- "extractor_location": "lm_eval_pairs",
2210
- "extractor_file": "scrolls",
2211
- "benchmark_type": "other",
2212
- "explanation": "Text generation evaluation - assesses quality of generated text"
2213
- },
2214
- "self": {
2215
- "evaluator": "log_likelihoods",
2216
- "extractor_location": "lm_eval_pairs",
2217
- "extractor_file": "self",
2218
- "benchmark_type": "other",
2219
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
2220
- },
2221
- "sglue": {
2222
- "evaluator": null,
2223
- "extractor_location": "lm_eval_pairs",
2224
- "extractor_file": "sglue",
2225
- "benchmark_type": "other",
2226
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
2227
- },
2228
- "simple_cooccurrence_bias": {
2229
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "simple_cooccurrence_bias",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "siqa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "siqa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "social_iqa": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "social_iqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "spanish_bench": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "spanish_bench",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "squad2": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "squad2",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "squad_completion": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "squad_completion",
- "benchmark_type": "question_answering",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "sst2": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "sst2",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "storycloze": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "storycloze",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "stsb": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "stsb",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "summarization": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "summarization",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "super": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "super",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "super_glue": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "super_glue",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "super_glue_lm_eval_v1": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "super_glue_lm_eval_v1",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "super_glue_lm_eval_v1_seq2seq": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "super_glue_lm_eval_v1_seq2seq",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "super_glue_t5_prompt": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "super_glue_t5_prompt",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "super_gpqa": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "super_gpqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "superglue": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "superglue",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "supergpqa": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "supergpqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "supergpqa_biology": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "super_gpqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "supergpqa_chemistry": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "super_gpqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "supergpqa_physics": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "super_gpqa",
- "benchmark_type": "question_answering",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "swag": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "swag",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "swde": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "swde",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "sycophancy": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "sycophancy",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "t0": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "t0",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "tag": {
- "evaluator": null,
- "extractor_location": "huggingface_pairs",
- "extractor_file": "tag",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "teca": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "teca",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "tinyarc": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "tinyarc",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "tinybenchmarks": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "tinybenchmarks",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "tinygsm8k": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "tinygsm8k",
- "benchmark_type": "mathematics",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "tinyhellaswag": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "tinyhellaswag",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "tinymmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "tinymmlu",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "tinytruthfulqa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "tinytruthfulqa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "tinywinogrande": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "tinywinogrande",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "tmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "tmlu",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "tmmluplus": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "tmmluplus",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "toxigen": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "toxigen",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "translation": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "translation",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "triviaqa": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "triviaqa",
- "benchmark_type": "question_answering",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "truthfulqa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "truthfulqa",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "truthfulqa_gen": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "truthfulqa_gen",
- "benchmark_type": "question_answering",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "truthfulqa_mc1": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "truthfulqa_mc1",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "truthfulqa_mc2": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "truthfulqa_mc2",
- "benchmark_type": "question_answering",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "truthfulqa_multi": {
- "evaluator": "mixed",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "truthfulqa_multi",
- "benchmark_type": "question_answering",
- "explanation": "Uses mixed evaluator"
- },
- "turblimp_core": {
- "evaluator": null,
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "turblimp_core",
- "benchmark_type": "other",
- "explanation": "NO EVALUATOR DEFINED - will fail with error"
- },
- "turkishmmlu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "turkishmmlu",
- "benchmark_type": "knowledge",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "twenty_newsgroups": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "twenty_newsgroups",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "unfair": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "unfair",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "unitxt": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "unitxt",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "unscramble": {
- "evaluator": "exact_match",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "unscramble",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "vaxx": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "vaxx",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "webqs": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "webqs",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "wic": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "wic",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "wiceu": {
- "evaluator": "log_likelihoods",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "wiceu",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "wikitext": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "wikitext",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "winogender": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "winogender",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "winogrande": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "winogrande",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "wmdp": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "wmdp",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "wmt14": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "wmt14",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "wmt14_en_fr": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "wmt14_en_fr",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "wmt14_fr_en": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "wmt14_fr_en",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "wmt16": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "wmt16",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "wmt16_de_en": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "wmt16_de_en",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "wmt16_en_de": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "wmt16_en_de",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "wmt16_en_ro": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "wmt16_en_ro",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "wmt16_ro_en": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "wmt16_ro_en",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "wmt_ro_en_t5_prompt": {
- "evaluator": "generation",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "wmt_ro_en_t5_prompt",
- "benchmark_type": "translation",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "wnli": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "wnli",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "wsc": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "wsc",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "wsc273": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "wsc273",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "xcopa": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "xcopa",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "xlsum": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "xlsum",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "xnli": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "xnli",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "xquad": {
- "evaluator": "generation",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "xquad",
- "benchmark_type": "other",
- "explanation": "Text generation evaluation - assesses quality of generated text"
- },
- "xstorycloze": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "xstorycloze",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "xsum": {
- "evaluator": "exact_match",
- "extractor_location": "huggingface_pairs",
- "extractor_file": "xsum",
- "benchmark_type": "other",
- "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
- },
- "xwinograd": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "xwinograd",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "yahoo": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "yahoo",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- },
- "zhoblimp": {
- "evaluator": "log_likelihoods",
- "extractor_location": "lm_eval_pairs",
- "extractor_file": "zhoblimp",
- "benchmark_type": "other",
- "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
- }
- }
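
For orientation only: the removed entries map each benchmark name to an evaluator (`log_likelihoods`, `generation`, `exact_match`, `mixed`, or `null`) plus extractor metadata, and entries with a `null` evaluator are documented as failing at runtime. The sketch below is not code from the wisent package; it only illustrates, under assumed names (`resolve_evaluator`, `registry`), how a mapping of this shape could be consumed so that `null` entries raise exactly the kind of error the explanations predict.

```python
import json


def resolve_evaluator(registry: dict, benchmark: str) -> str:
    """Return the evaluator name for a benchmark, failing on missing/null entries."""
    entry = registry.get(benchmark)
    if entry is None:
        raise KeyError(f"Unknown benchmark: {benchmark}")
    evaluator = entry.get("evaluator")
    if evaluator is None:
        # Mirrors the "NO EVALUATOR DEFINED - will fail with error" entries above.
        raise ValueError(f"No evaluator defined for benchmark '{benchmark}'")
    return evaluator


# Minimal registry with the same shape as the removed JSON (illustrative values).
registry = json.loads("""
{
  "truthfulqa_mc1": {"evaluator": "log_likelihoods", "benchmark_type": "question_answering"},
  "sst2": {"evaluator": null, "benchmark_type": "other"}
}
""")

print(resolve_evaluator(registry, "truthfulqa_mc1"))  # -> "log_likelihoods"
# resolve_evaluator(registry, "sst2") would raise ValueError, as the mapping predicts.
```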