wisent-0.7.379-py3-none-any.whl → wisent-0.7.901-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
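If you want to reproduce this kind of file-level comparison locally, the sketch below uses only the Python standard library. It assumes both wheels have already been downloaded; the `wheels/` paths and the `pip download` commands in the comments are illustrative assumptions, not part of this diff.

```python
# Minimal sketch: list files added/removed between two wheel versions.
# Assumes the wheels were fetched beforehand, e.g.:
#   pip download wisent==0.7.379 --no-deps -d wheels/
#   pip download wisent==0.7.901 --no-deps -d wheels/
from zipfile import ZipFile

OLD = "wheels/wisent-0.7.379-py3-none-any.whl"  # placeholder path
NEW = "wheels/wisent-0.7.901-py3-none-any.whl"  # placeholder path

with ZipFile(OLD) as old_whl, ZipFile(NEW) as new_whl:
    old_files = set(old_whl.namelist())
    new_files = set(new_whl.namelist())

print("added:", len(new_files - old_files))
print("removed:", len(old_files - new_files))
print("present in both:", len(old_files & new_files))
```

Per-file line counts like the `+393 -0` figures below would additionally require diffing the extracted file contents (for example with `difflib.unified_diff`).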
Files changed (1020)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +22 -6
  3. wisent/core/activations/activation_cache.py +393 -0
  4. wisent/core/activations/activations.py +22 -40
  5. wisent/core/activations/activations_collector.py +145 -373
  6. wisent/core/activations/classifier_inference_strategy.py +195 -0
  7. wisent/core/activations/core/atoms.py +8 -92
  8. wisent/core/activations/extraction_strategy.py +480 -0
  9. wisent/core/agent/diagnose/response_diagnostics.py +3 -3
  10. wisent/core/agent/diagnose.py +3 -3
  11. wisent/core/autonomous_agent.py +2 -2
  12. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  13. wisent/core/cli/__init__.py +2 -1
  14. wisent/core/cli/agent/apply_steering.py +25 -31
  15. wisent/core/cli/agent/evaluate_response.py +18 -20
  16. wisent/core/cli/agent/train_classifier.py +36 -26
  17. wisent/core/cli/check_linearity.py +35 -3
  18. wisent/core/cli/cluster_benchmarks.py +470 -0
  19. wisent/core/cli/create_steering_vector.py +19 -9
  20. wisent/core/cli/diagnose_vectors.py +7 -4
  21. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  22. wisent/core/cli/generate_pairs_from_task.py +9 -56
  23. wisent/core/cli/generate_vector_from_task.py +4 -0
  24. wisent/core/cli/geometry_search.py +137 -0
  25. wisent/core/cli/get_activations.py +13 -37
  26. wisent/core/cli/method_optimizer.py +860 -0
  27. wisent/core/cli/modify_weights.py +3 -2
  28. wisent/core/cli/optimize.py +44 -5
  29. wisent/core/cli/optimize_classification.py +5 -6
  30. wisent/core/cli/optimize_sample_size.py +9 -23
  31. wisent/core/cli/optimize_steering.py +433 -159
  32. wisent/core/cli/optimize_weights.py +67 -7
  33. wisent/core/cli/preview_pairs.py +203 -0
  34. wisent/core/cli/steering_method_trainer.py +8 -7
  35. wisent/core/cli/steering_search_space.py +20 -15
  36. wisent/core/cli/tasks.py +31 -117
  37. wisent/core/cli/train_unified_goodness.py +18 -19
  38. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1582 -177
  39. wisent/core/contrastive_pairs/diagnostics/linearity.py +70 -80
  40. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  53. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  54. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
  55. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  56. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  57. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  58. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  59. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  60. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  61. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  62. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  63. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  64. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  65. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  66. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  67. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  68. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  69. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
  70. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +11 -5
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  273. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  274. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  275. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  276. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  277. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  278. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  279. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  280. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  281. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  282. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  283. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  284. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  285. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  286. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  287. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  288. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  289. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  290. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  291. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  292. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  293. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  294. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  295. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  296. wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
  297. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
  298. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
  299. wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
  300. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
  301. wisent/core/evaluators/personalization/coherence.py +46 -0
  302. wisent/core/geometry_runner.py +995 -0
  303. wisent/core/geometry_search_space.py +237 -0
  304. wisent/core/hyperparameter_optimizer.py +14 -14
  305. wisent/core/lm_eval_harness_ground_truth.py +7 -11
  306. wisent/core/main.py +6 -0
  307. wisent/core/models/core/atoms.py +5 -3
  308. wisent/core/models/wisent_model.py +9 -8
  309. wisent/core/opti/methods/opti_weights.py +29 -2
  310. wisent/core/optuna/classifier/activation_generator.py +14 -12
  311. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  312. wisent/core/optuna/steering/steering_optimization.py +14 -9
  313. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  314. wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
  315. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  316. wisent/core/parser_arguments/generate_vector_from_task_parser.py +22 -2
  317. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  318. wisent/core/parser_arguments/main_parser.py +16 -0
  319. wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
  320. wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
  321. wisent/core/parser_arguments/tasks_parser.py +7 -19
  322. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  323. wisent/core/steering.py +5 -3
  324. wisent/core/steering_methods/core/atoms.py +1 -2
  325. wisent/core/steering_methods/methods/caa.py +1 -1
  326. wisent/core/steering_methods/methods/hyperplane.py +75 -0
  327. wisent/core/steering_methods/methods/prism.py +1 -2
  328. wisent/core/steering_methods/methods/pulse.py +39 -8
  329. wisent/core/steering_methods/methods/titan.py +59 -14
  330. wisent/core/steering_methods/registry.py +52 -12
  331. wisent/core/steering_optimizer.py +15 -15
  332. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  333. wisent/core/trainers/steering_trainer.py +11 -20
  334. wisent/core/utils/device.py +27 -27
  335. wisent/core/utils/layer_combinations.py +70 -0
  336. wisent/examples/__init__.py +1 -0
  337. wisent/examples/scripts/__init__.py +1 -0
  338. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  339. wisent/examples/scripts/discover_directions.py +469 -0
  340. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  341. wisent/examples/scripts/generate_paper_data.py +384 -0
  342. wisent/examples/scripts/intervention_validation.py +626 -0
  343. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  344. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  345. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  346. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  347. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  348. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  349. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  350. wisent/examples/scripts/search_all_short_names.py +31 -0
  351. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  352. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  353. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  354. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  355. wisent/examples/scripts/test_one_benchmark.py +324 -0
  356. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  357. wisent/examples/scripts/threshold_analysis.py +434 -0
  358. wisent/examples/scripts/visualization_gallery.py +582 -0
  359. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  360. wisent/parameters/lm_eval/category_directions.json +137 -0
  361. wisent/parameters/lm_eval/repair_plan.json +282 -0
  362. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
  363. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  364. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  365. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  366. wisent/scripts/run_quality_metrics_sweep.sh +22 -27
  367. wisent/tests/test_aggregation_geometry.py +236 -0
  368. wisent/tests/test_detector_accuracy.py +163 -0
  369. wisent/tests/test_geometry_exhaustive.py +1202 -0
  370. wisent/tests/visualize_geometry.py +255 -61
  371. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  372. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/RECORD +376 -974
  373. wisent/core/activations/prompt_construction_strategy.py +0 -47
  374. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  375. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
  376. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
  377. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
  378. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
  379. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
  380. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
  381. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
  382. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
  383. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
  384. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
  385. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
  386. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
  387. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
  388. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
  389. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
  390. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
  391. wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
  392. wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
  393. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
  394. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
  395. wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
  396. wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
  397. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
  398. wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
  399. wisent/examples/scripts/results/benchmark_tags.json +0 -917
  400. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
  401. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
  402. wisent/examples/scripts/results/failing_benchmarks.json +0 -946
  403. wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
  404. wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
  405. wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
  406. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
  407. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
  408. wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
  409. wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
  410. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
  411. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
  412. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
  413. wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
  414. wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
  415. wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
  416. wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
  417. wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
  418. wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
  419. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
  420. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
  421. wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
  422. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
  423. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
  424. wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
  425. wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
  426. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
  427. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
  428. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
  429. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
  430. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  431. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
  432. wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
  433. wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
  434. wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
  435. wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
  436. wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
  437. wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
  438. wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
  439. wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
  440. wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
  441. wisent/examples/scripts/results/test_aime_pairs.json +0 -8
  442. wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
  443. wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
  444. wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
  445. wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
  446. wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
  447. wisent/examples/scripts/results/test_anli_pairs.json +0 -8
  448. wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
  449. wisent/examples/scripts/results/test_apps_pairs.json +0 -8
  450. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
  451. wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
  452. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
  453. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
  454. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
  455. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
  456. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
  457. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
  458. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
  459. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
  460. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
  461. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
  462. wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
  463. wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
  464. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
  465. wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
  466. wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
  467. wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
  468. wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
  469. wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
  470. wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
  471. wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
  472. wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
  473. wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
  474. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
  475. wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
  476. wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
  477. wisent/examples/scripts/results/test_atis_pairs.json +0 -8
  478. wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
  479. wisent/examples/scripts/results/test_babi_pairs.json +0 -8
  480. wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
  481. wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
  482. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
  483. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
  484. wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
  485. wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
  486. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
  487. wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
  488. wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
  489. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
  490. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
  491. wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
  492. wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
  493. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
  494. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
  495. wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
  496. wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
  497. wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
  498. wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
  499. wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
  500. wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
  501. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
  502. wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
  503. wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
  504. wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
  505. wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
  506. wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
  507. wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
  508. wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
  509. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
  510. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
  511. wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
  512. wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
  513. wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
  514. wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
  515. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
  516. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
  517. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
  518. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
  519. wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
  520. wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
  521. wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
  522. wisent/examples/scripts/results/test_c4_pairs.json +0 -8
  523. wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
  524. wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
  525. wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
  526. wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
  527. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
  528. wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
  529. wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
  530. wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
  531. wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
  532. wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
  533. wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
  534. wisent/examples/scripts/results/test_cb_pairs.json +0 -8
  535. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
  536. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
  537. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
  538. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
  539. wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
  540. wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
  541. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
  542. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
  543. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
  544. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
  545. wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
  546. wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
  547. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
  548. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
  549. wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
  550. wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
  551. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
  552. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
  553. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
  554. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
  555. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
  556. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
  557. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
  558. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
  559. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
  560. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
  561. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
  562. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
  563. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
  564. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
  565. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
  566. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
  567. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
  568. wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
  569. wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
  570. wisent/examples/scripts/results/test_cola_pairs.json +0 -8
  571. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
  572. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
  573. wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
  574. wisent/examples/scripts/results/test_conala_pairs.json +0 -8
  575. wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
  576. wisent/examples/scripts/results/test_concode_pairs.json +0 -8
  577. wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
  578. wisent/examples/scripts/results/test_copa_pairs.json +0 -8
  579. wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
  580. wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
  581. wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
  582. wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
  583. wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
  584. wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
  585. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
  586. wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
  587. wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
  588. wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
  589. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
  590. wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
  591. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
  592. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
  593. wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
  594. wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
  595. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
  596. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
  597. wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
  598. wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
  599. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
  600. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
  601. wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
  602. wisent/examples/scripts/results/test_drop_pairs.json +0 -8
  603. wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
  604. wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
  605. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
  606. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
  607. wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
  608. wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
  609. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
  610. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
  611. wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
  612. wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
  613. wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
  614. wisent/examples/scripts/results/test_escola_pairs.json +0 -8
  615. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
  616. wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
  617. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
  618. wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
  619. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
  620. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
  621. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
  622. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
  623. wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
  624. wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
  625. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
  626. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
  627. wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
  628. wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
  629. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
  630. wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
  631. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
  632. wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
  633. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  634. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  635. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
  636. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
  637. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
  638. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
  639. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
  640. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
  641. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
  642. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
  643. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  644. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  645. wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
  646. wisent/examples/scripts/results/test_fda_pairs.json +0 -8
  647. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
  648. wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
  649. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
  650. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
  651. wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
  652. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
  653. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
  654. wisent/examples/scripts/results/test_fld_pairs.json +0 -8
  655. wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
  656. wisent/examples/scripts/results/test_flores_pairs.json +0 -14
  657. wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
  658. wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
  659. wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
  660. wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
  661. wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
  662. wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
  663. wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
  664. wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
  665. wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
  666. wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
  667. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
  668. wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
  669. wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
  670. wisent/examples/scripts/results/test_glue_pairs.json +0 -14
  671. wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
  672. wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
  673. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
  674. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
  675. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
  676. wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
  677. wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
  678. wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
  679. wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
  680. wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
  681. wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
  682. wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
  683. wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
  684. wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
  685. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
  686. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
  687. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
  688. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
  689. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
  690. wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
  691. wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
  692. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
  693. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
  694. wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
  695. wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
  696. wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
  697. wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
  698. wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
  699. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
  700. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
  701. wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
  702. wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
  703. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
  704. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
  705. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
  706. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
  707. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
  708. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
  709. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
  710. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
  711. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
  712. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
  713. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
  714. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
  715. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
  716. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
  717. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
  718. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
  719. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
  720. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
  721. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
  722. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
  723. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
  724. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
  725. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
  726. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
  727. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
  728. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
  729. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
  730. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
  731. wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
  732. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
  733. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
  734. wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
  735. wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
  736. wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
  737. wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
  738. wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
  739. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
  740. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
  741. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
  742. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
  743. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
  744. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
  745. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
  746. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
  747. wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
  748. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  749. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  750. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
  751. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
  752. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
  753. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
  754. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
  755. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
  756. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
  757. wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
  758. wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
  759. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  760. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  761. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  762. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  763. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
  764. wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
  765. wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
  766. wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
  767. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
  768. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
  769. wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
  770. wisent/examples/scripts/results/test_libra_pairs.json +0 -14
  771. wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
  772. wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
  773. wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
  774. wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
  775. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
  776. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
  777. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
  778. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
  779. wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
  780. wisent/examples/scripts/results/test_llama_pairs.json +0 -8
  781. wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
  782. wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
  783. wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
  784. wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
  785. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
  786. wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
  787. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
  788. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
  789. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
  790. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
  791. wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
  792. wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
  793. wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
  794. wisent/examples/scripts/results/test_math500_pairs.json +0 -8
  795. wisent/examples/scripts/results/test_math_evaluation.json +0 -30
  796. wisent/examples/scripts/results/test_math_pairs.json +0 -8
  797. wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
  798. wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
  799. wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
  800. wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
  801. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
  802. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
  803. wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
  804. wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
  805. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
  806. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
  807. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
  808. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
  809. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
  810. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
  811. wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
  812. wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
  813. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
  814. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
  815. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
  816. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
  817. wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
  818. wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
  819. wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
  820. wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
  821. wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
  822. wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
  823. wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
  824. wisent/examples/scripts/results/test_mela_pairs.json +0 -14
  825. wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
  826. wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
  827. wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
  828. wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
  829. wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
  830. wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
  831. wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
  832. wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
  833. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
  834. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
  835. wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
  836. wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
  837. wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
  838. wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
  839. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
  840. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
  841. wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
  842. wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
  843. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
  844. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
  845. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
  846. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
  847. wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
  848. wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
  849. wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
  850. wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
  851. wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
  852. wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
  853. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
  854. wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
  855. wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
  856. wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
  857. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
  858. wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
  859. wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
  860. wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
  861. wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
  862. wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
  863. wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
  864. wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
  865. wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
  866. wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
  867. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
  868. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
  869. wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
  870. wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
  871. wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
  872. wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
  873. wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
  874. wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
  875. wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
  876. wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
  877. wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
  878. wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
  879. wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
  880. wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
  881. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
  882. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
  883. wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
  884. wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
  885. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
  886. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
  887. wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
  888. wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
  889. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
  890. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
  891. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
  892. wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
  893. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
  894. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
  895. wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
  896. wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
  897. wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
  898. wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
  899. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
  900. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
  901. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
  902. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
  903. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
  904. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
  905. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
  906. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
  907. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
  908. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
  909. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  910. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  911. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  912. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  913. wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
  914. wisent/examples/scripts/results/test_prost_pairs.json +0 -8
  915. wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
  916. wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
  917. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
  918. wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
  919. wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
  920. wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
  921. wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
  922. wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
  923. wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
  924. wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
  925. wisent/examples/scripts/results/test_race_evaluation.json +0 -30
  926. wisent/examples/scripts/results/test_race_pairs.json +0 -8
  927. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
  928. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
  929. wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
  930. wisent/examples/scripts/results/test_recode_pairs.json +0 -8
  931. wisent/examples/scripts/results/test_record_evaluation.json +0 -30
  932. wisent/examples/scripts/results/test_record_pairs.json +0 -8
  933. wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
  934. wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
  935. wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
  936. wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
  937. wisent/examples/scripts/results/test_score_evaluation.json +0 -51
  938. wisent/examples/scripts/results/test_score_pairs.json +0 -14
  939. wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
  940. wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
  941. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
  942. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
  943. wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
  944. wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
  945. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
  946. wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
  947. wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
  948. wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
  949. wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
  950. wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
  951. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
  952. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
  953. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
  954. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
  955. wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
  956. wisent/examples/scripts/results/test_swag_pairs.json +0 -8
  957. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
  958. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
  959. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
  960. wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
  961. wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
  962. wisent/examples/scripts/results/test_translation_pairs.json +0 -14
  963. wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
  964. wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
  965. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
  966. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
  967. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
  968. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
  969. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
  970. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
  971. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
  972. wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
  973. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
  974. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
  975. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
  976. wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
  977. wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
  978. wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
  979. wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
  980. wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
  981. wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
  982. wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
  983. wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
  984. wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
  985. wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
  986. wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
  987. wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
  988. wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
  989. wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
  990. wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
  991. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
  992. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
  993. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
  994. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
  995. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
  996. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
  997. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
  998. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
  999. wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
  1000. wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
  1001. wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
  1002. wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
  1003. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
  1004. wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
  1005. wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
  1006. wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
  1007. wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
  1008. wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
  1009. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
  1010. wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
  1011. wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
  1012. wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
  1013. wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
  1014. wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
  1015. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
  1016. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
  1017. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  1018. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  1019. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  1020. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
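
The hunk below removes a task-metadata JSON file whose entries map a task identifier to `name`, `description`, `paper`, and `homepage` fields (the top-level `"."` entry describes the task list as a whole). As a minimal sketch of how a registry with that shape could be loaded and inspected — the file path and helper names here are hypothetical and only illustrate the structure visible in the removed content, not an API of the package:

```python
import json
from pathlib import Path

# Hypothetical path: the hunk does not show where the removed metadata file lived.
TASKS_JSON = Path("tasks_metadata.json")


def load_task_registry(path: Path) -> dict[str, dict[str, str]]:
    """Load a mapping of task id -> {name, description, paper, homepage}."""
    with path.open(encoding="utf-8") as fh:
        return json.load(fh)


def summarize(registry: dict[str, dict[str, str]]) -> None:
    """Print one line per task: id, display name, and homepage if present."""
    for task_id, meta in sorted(registry.items()):
        name = meta.get("name", task_id)
        homepage = meta.get("homepage", "")
        print(f"{task_id:30s} {name:40s} {homepage}")


if __name__ == "__main__":
    summarize(load_task_registry(TASKS_JSON))
```

This is a read-only sketch; it makes no assumption about how the package itself produced or consumed the file before it was deleted in this version.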
@@ -1,1244 +0,0 @@
1
- {
2
- ".": {
3
- "name": "Tasks",
4
- "description": "A list of supported tasks and task groupings can be viewed with `lm-eval --tasks list`.",
5
- "paper": "",
6
- "homepage": ""
7
- },
8
- "aclue": {
9
- "name": "ACLUE",
10
- "description": "Can Large Language Model Comprehend Ancient Chinese? A Preliminary Test on ACLUE\nhttps://arxiv.org/abs/2310.09550",
11
- "paper": "",
12
- "homepage": "Homepage: https://github.com/isen-zhang/ACLUE"
13
- },
14
- "acpbench": {
15
- "name": "ACPBench",
16
- "description": "**",
17
- "paper": "",
18
- "homepage": "Homepage:** https://ibm.github.io/ACPBench/"
19
- },
20
- "aexams": {
21
- "name": "Arabic EXAMS",
22
- "description": "EXAMS: a resource specialized in multilingual high school exam questions.\nThe original paper [EXAMS](https://aclanthology.org/2020.emnlp-main.438/)",
23
- "paper": "paper [EXAMS](https://aclanthology.org/2020.emnlp-main.438/",
24
- "homepage": "Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/EXAMS_Arabic"
25
- },
26
- "afrimgsm": {
27
- "name": "MathQA",
28
- "description": "IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models\nhttps://arxiv.org/pdf/2406.03368",
29
- "paper": "",
30
- "homepage": ""
31
- },
32
- "afrimmlu": {
33
- "name": "MathQA",
34
- "description": "IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models\nhttps://arxiv.org/pdf/2406.03368",
35
- "paper": "",
36
- "homepage": ""
37
- },
38
- "afrixnli": {
39
- "name": "IrokoBench",
40
- "description": "IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models\nhttps://arxiv.org/pdf/2406.03368",
41
- "paper": "",
42
- "homepage": ""
43
- },
44
- "afrobench": {
45
- "name": "AfroBench",
46
- "description": "> Large-scale multilingual evaluations, such as MEGA, often include only a handful of African languages due to the scarcity of high-quality evaluation data and the limited discoverability of existing African datasets. This lack of representation hinders comprehensive LLM evaluation across a diverse range of languages and tasks. To address these challenges, we introduce AfroBench -- a multi-task benchmark for evaluating the performance of LLMs across 64 African languages, 15 tasks and 22 datasets",
47
- "paper": "Paper Link: https://arxiv.org/abs/2311.07978",
48
- "homepage": "HomePage: https://mcgill-nlp.github.io/AfroBench/"
49
- },
50
- "afrobench_adr": {
51
- "name": "Automatic Diacritics Restoration (ADR)",
52
- "description": "Automatic Diacritics Restoration (ADR) is the task of restoring diacritical marks in text where they have been omitted or removed.\nThis process is essential for languages where diacritics alter pronunciation, meaning, or grammatical structure.\nADR requires the model to have a deep understanding of linguistic context, syntax, and semantics to accurately predict and reinsert the appropriate diacritics.",
53
- "paper": "",
54
- "homepage": ""
55
- },
56
- "afrobench_afriqa": {
57
- "name": "## Paper",
58
- "description": ">AfriQA is the first cross-lingual question answering (QA) dataset with a focus on African languages. The dataset includes over 12,000 XOR QA examples across 10 African languages, making it an invaluable resource for developing more equitable QA technology. African languages have historically been underserved in the digital landscape, with far less in-language content available online. This makes it difficult for QA systems to provide accurate information to users in their native language. Howev",
59
- "paper": "Paper Link: https://arxiv.org/abs/2305.06897",
60
- "homepage": "HomePage: https://github.com/masakhane-io/afriqa"
61
- },
62
- "afrobench_afrisenti": {
63
- "name": "## Paper",
64
- "description": ">Africa is home to over 2,000 languages from over six language families and has the highest linguistic diversity among all continents. This includes 75 languages with at least one million speakers each. Yet, there is little NLP research conducted on African languages. Crucial in enabling such research is the availability of high-quality annotated datasets. In this paper, we introduce AfriSenti, a sentiment analysis benchmark that contains a total of >110,000 tweets in 14 African languages (Amhar",
65
- "paper": "Paper Link: https://aclanthology.org/2023.emnlp-main.862/",
66
- "homepage": "HomePage: https://github.com/afrisenti-semeval/afrisent-semeval-2023"
67
- },
68
- "afrobench_belebele": {
69
- "name": "## Paper",
70
- "description": ">Belebele is a multiple-choice machine reading comprehension (MRC) dataset spanning 122 language variants. This dataset enables the evaluation of mono- and multi-lingual models in high-, medium-, and low-resource languages. Each question has four multiple-choice answers and is linked to a short passage from the FLORES-200 dataset. The human annotation procedure was carefully curated to create questions that discriminate between different levels of generalizable language comprehension and is rein",
71
- "paper": "Paper Link: https://aclanthology.org/2023.emnlp-main.862/",
72
- "homepage": "HomePage: https://github.com/facebookresearch/belebele"
73
- },
74
- "afrobench_flores": {
75
- "name": "## Paper",
76
- "description": "## Paper\nTitle: `The FLORES-200 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation`",
77
- "paper": "Paper Link: https://arxiv.org/abs/2207.04672",
78
- "homepage": "HomePage: https://huggingface.co/datasets/facebook/flores"
79
- },
80
- "afrobench_injongointent": {
81
- "name": "## Paper",
82
- "description": ">Slot-filling and intent detection are well-established tasks in Conversational AI. However, current large-scale benchmarks for these tasks often exclude evaluations of low-resource languages and rely on translations from English benchmarks, thereby predominantly reflecting Western-centric concepts. In this paper, we introduce Injongo -- a multicultural, open-source benchmark dataset for 16 African languages with utterances generated by native speakers across diverse domains, including banking, ",
83
- "paper": "Paper Link: https://arxiv.org/abs/2502.09814",
84
- "homepage": ""
85
- },
86
- "afrobench_mafand": {
87
- "name": "## Paper",
88
- "description": ">Recent advances in the pre-training of language models leverage large-scale datasets to create multilingual models. However, low-resource languages are mostly left out in these datasets. This is primarily because many widely spoken languages are not well represented on the web and therefore excluded from the large-scale crawls used to create datasets. Furthermore, downstream users of these models are restricted to the selection of languages originally chosen for pre-training. This work investig",
89
- "paper": "Paper Link: https://aclanthology.org/2022.naacl-main.223/",
90
- "homepage": "HomePage: https://github.com/masakhane-io/lafand-mt"
91
- },
92
- "afrobench_masakhaner": {
93
- "name": "## Paper",
94
- "description": ">African languages are spoken by over a billion people, but they are under-represented in NLP research and development. Multiple challenges exist, including the limited availability of annotated training and evaluation datasets as well as the lack of understanding of which settings, languages, and recently proposed methods like cross-lingual transfer will be effective. In this paper, we aim to move towards solutions for these challenges, focusing on the task of named entity recognition (NER). We",
95
- "paper": "Paper Link: https://aclanthology.org/2022.emnlp-main.298/",
96
- "homepage": "HomePage: https://github.com/masakhane-io/masakhane-ner"
97
- },
98
- "afrobench_masakhanews": {
99
- "name": "## Paper",
100
- "description": ">African languages are severely under-represented in NLP research due to lack of datasets covering several NLP tasks. While there are individual language specific datasets that are being expanded to different tasks, only a handful of NLP tasks (e.g. named entity recognition and machine translation) have standardized benchmark datasets covering several geographical and typologically-diverse African languages. In this paper, we develop MasakhaNEWS -- a new benchmark dataset for news topic classifi",
101
- "paper": "Paper Link: https://aclanthology.org/2023.ijcnlp-main.10/",
102
- "homepage": "HomePage: https://github.com/masakhane-io/masakhane-news"
103
- },
104
- "afrobench_masakhapos": {
105
- "name": "## Paper",
106
- "description": ">In this paper, we present AfricaPOS, the largest part-of-speech (POS) dataset for 20 typologically diverse African languages. We discuss the challenges in annotating POS for these languages using the universal dependencies (UD) guidelines. We conducted extensive POS baseline experiments using both conditional random field and several multilingual pre-trained language models. We applied various cross-lingual transfer models trained with data available in the UD. Evaluating on the AfricaPOS datas",
107
- "paper": "Paper Link: https://aclanthology.org/2023.acl-long.609/",
108
- "homepage": "HomePage: https://github.com/masakhane-io/masakhane-pos"
109
- },
110
- "afrobench_naijarc": {
111
- "name": "## Paper",
112
- "description": ">In this paper, we create NaijaRC: a new multi-choice Reading Comprehension dataset for three native Nigeria languages that is based on high-school reading comprehension examination. We provide baseline results by performing cross-lingual transfer using existing English RACE and Belebele training dataset based on a pre-trained encoder-only model. Additionally, we provide results by prompting large language models (LLMs) like GPT-4.",
113
- "paper": "Paper Link: https://arxiv.org/abs/2308.09768",
114
- "homepage": "HomePage: https://huggingface.co/datasets/aremuadeolajr/NaijaRC"
115
- },
116
- "afrobench_nollysenti": {
117
- "name": "## Paper",
118
- "description": ">Africa has over 2000 indigenous languages but they are under-represented in NLP research due to lack of datasets. In recent years, there have been progress in developing labelled corpora for African languages. However, they are often available in a single domain and may not generalize to other domains. In this paper, we focus on the task of sentiment classification for cross-domain adaptation. We create a new dataset, Nollywood movie reviews for five languages widely spoken in Nigeria (English,",
119
- "paper": "Paper Link: https://aclanthology.org/2023.acl-short.85/",
120
- "homepage": "HomePage: https://github.com/IyanuSh/NollySenti"
121
- },
122
- "afrobench_ntrex": {
123
- "name": "## Paper",
124
- "description": ">We release NTREX-128, a data set for machine translation (MT) evaluation from English into a total of 128 target languages. The paper describes the data creation process and proposes a quality filtering method based on human evaluation. We show experimental results which confirm that the directionality of test sets translation indeed plays an important role wrt. the usefulness of the corresponding metrics\u2019 scores. Thus, we recommend that the NTREX-128 data set should be used for evaluation of E",
125
- "paper": "Paper Link: https://aclanthology.org/2022.sumeval-1.4/",
126
- "homepage": "HomePage: https://github.com/MicrosoftTranslator/NTREX"
127
- },
128
- "afrobench_openai_mmlu": {
129
- "name": "## Paper",
130
- "description": ">We propose a new test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more. To attain high accuracy on this test, models must possess extensive world knowledge and problem solving ability. We find that while most recent models have near random-chance accuracy, the very largest GPT-3 model improves over random chance by almost 20 percentage points on average. However, on every one of the 57 tasks, the",
131
- "paper": "Paper Link: https://arxiv.org/abs/2009.03300",
132
- "homepage": "HomePage: https://huggingface.co/datasets/openai/MMMLU"
133
- },
134
- "afrobench_salt": {
135
- "name": "## Paper",
136
- "description": ">SALT is a multi-way parallel text and speech corpus of Engish and six languages widely spoken in Uganda and East Africa: Luganda, Lugbara, Acholi, Runyankole, Ateso and Swahili. The core of the dataset is a set of 25,000 sentences covering a range of topics of local relevance, such as agriculture, health and society. Each sentence is translated into all languages, to support machine translation, and speech recordings are made for approximately 5,000 of the sentences both by a variety of speaker",
137
- "paper": "Paper Link: https://aclanthology.org/2023.emnlp-main.862/",
138
- "homepage": "HomePage: https://github.com/SunbirdAI/salt"
139
- },
140
- "afrobench_sib": {
141
- "name": "## Paper",
142
- "description": ">Despite the progress in building multilingual language models, evaluation is often limited to a few languages with available datasets which excludes a large number of low-resource languages. In this paper, we create SIB-200\u2014a large-scale open-sourced benchmark dataset for topic classification in 205 languages and dialects to address the lack of evaluation dataset for Natural Language Understanding (NLU). For many of the languages covered in SIB-200, this is the first publicly available evaluati",
143
- "paper": "Paper Link: https://aclanthology.org/2024.eacl-long.14/",
144
- "homepage": "HomePage: https://github.com/dadelani/sib-200"
145
- },
146
- "afrobench_uhura-arc-easy": {
147
- "name": "## Paper",
148
- "description": ">Evaluations of Large Language Models (LLMs) on knowledge-intensive tasks and factual accuracy often focus on high-resource languages primarily because datasets for low-resource languages (LRLs) are scarce. In this paper, we present Uhura -- a new benchmark that focuses on two tasks in six typologically-diverse African languages, created via human translation of existing English benchmarks. The first dataset, Uhura-ARC-Easy, is composed of multiple-choice science questions. The second, Uhura-Tru",
149
- "paper": "Paper Link: https://arxiv.org/abs/2412.00948",
150
- "homepage": "HomePage: https://huggingface.co/datasets/masakhane/uhura-arc-easy"
151
- },
152
- "afrobench_xlsum": {
153
- "name": "## Paper",
154
- "description": "ive Summarization for 44 Languages`",
155
- "paper": "Paper Link: https://aclanthology.org/2021.findings-acl.413/",
156
- "homepage": "HomePage: https://github.com/csebuetnlp/xl-sum"
157
- },
158
- "agieval": {
159
- "name": "AGIEval",
160
- "description": "https://arxiv.org/abs/2304.06364.pdf",
161
- "paper": "Abstract: https://arxiv.org/abs/2304.06364.pdf",
162
- "homepage": "Homepage: https://github.com/ruixiangcui/AGIEval"
163
- },
164
- "alghafa_copa_ar": {
165
- "name": "alghafa_copa_ar",
166
- "description": "The Choice Of Plausible Alternatives (COPA) evaluation provides researchers with a tool for assessing progress in open-domain commonsense causal reasoning.",
167
- "paper": "",
168
- "homepage": "Homepage](https://people.ict.usc.edu/~gordon/copa.html"
169
- },
170
- "alghafa_piqa_ar": {
171
- "name": "alghafa_piqa_ar",
172
- "description": "Original Title: `PIQA: Reasoning about Physical Commonsense in Natural Language`",
173
- "paper": "paper: [PICA](https://arxiv.org/abs/1911.11641",
174
- "homepage": "Homepage](https://yonatanbisk.com/piqa"
175
- },
176
- "anli": {
177
- "name": "ANLI",
178
- "description": "Title: `Adversarial NLI: A New Benchmark for Natural Language Understanding`",
179
- "paper": "Paper Link: https://arxiv.org/abs/1910.14599",
180
- "homepage": "Homepage: https://github.com/facebookresearch/anli"
181
- },
182
- "arab_culture": {
183
- "name": "Arab Culture",
184
- "description": "https://arxiv.org/abs/2502.12788",
185
- "paper": "Abstract: https://arxiv.org/abs/2502.12788",
186
- "homepage": "Homepage: https://github.com/fajri91/ArabicCulture"
187
- },
188
- "arab_culture_completion": {
189
- "name": "Arab Culture",
190
- "description": "https://arxiv.org/abs/2502.12788",
191
- "paper": "Abstract: https://arxiv.org/abs/2502.12788",
192
- "homepage": "Homepage: https://github.com/fajri91/ArabicCulture"
193
- },
194
- "arabic_leaderboard_complete": {
195
- "name": "Arabic Leaderboard",
196
- "description": "= \"Recent advances in the space of Arabic large language models have opened up a wealth of potential practical applications. From optimal training strategies, large scale data acquisition and continuously increasing NLP resources, the Arabic LLM landscape has improved in a very short span of time, despite being plagued by training data scarcity and limited evaluation resources compared to English. In line with contributing towards this ever-growing field, we introduce AlGhafa, a new multiple-cho",
197
- "paper": "Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf",
198
- "homepage": "Homepage: https://huggingface.co/spaces/OALL/Open-Arabic-LLM-Leaderboard"
199
- },
200
- "arabic_leaderboard_light": {
201
- "name": "Arabic Leaderboard Light",
202
- "description": "This leaderboard follows all the details as in [`arabic_leaderboard_complete`](../arabic_leaderboard_complete), except that a light version - 10% random sample of the test set of each benchmark - is used to test the language models.",
203
- "paper": "",
204
- "homepage": ""
205
- },
206
- "arabicmmlu": {
207
- "name": "ArabicMMLU",
208
- "description": "https://arxiv.org/abs/2402.12840",
209
- "paper": "Abstract: https://arxiv.org/abs/2402.12840",
210
- "homepage": "Homepage: https://github.com/mbzuai-nlp/ArabicMMLU"
211
- },
212
- "aradice": {
213
- "name": "AraDiCE",
214
- "description": "** Arabic, with its rich diversity of dialects, remains significantly underrepresented in Large Language Models, particularly in dialectal variations. We address this gap by introducing seven synthetic datasets in dialects alongside Modern Standard Arabic (MSA), created using Machine Translation (MT) combined with human post-editing. We present AraDiCE, a benchmark for Arabic Dialect and Cultural Evaluation. We evaluate LLMs on dialect comprehension and generation, focusing specifically on low-r",
215
- "paper": "",
216
- "homepage": ""
217
- },
218
- "arc": {
219
- "name": "ARC",
220
- "description": "https://arxiv.org/abs/1803.05457",
221
- "paper": "Abstract: https://arxiv.org/abs/1803.05457",
222
- "homepage": "Homepage: https://allenai.org/data/arc"
223
- },
224
- "arc_mt": {
225
- "name": "arc mt",
226
- "description": "arc mt is an implementation of tasks to support machine translated arc\nchallenge evals, to improve eval support across a number of additional\nlanguages.",
227
- "paper": "",
228
- "homepage": ""
229
- },
230
- "arithmetic": {
231
- "name": "Arithmetic",
232
- "description": "https://arxiv.org/abs/2005.14165",
233
- "paper": "Abstract: https://arxiv.org/abs/2005.14165",
234
- "homepage": "Homepage: https://github.com/openai/gpt-3/tree/master/data"
235
- },
236
- "asdiv": {
237
- "name": "ASDiv",
238
- "description": "https://arxiv.org/abs/2106.15772",
239
- "paper": "Abstract: https://arxiv.org/abs/2106.15772",
240
- "homepage": "Homepage: https://github.com/chaochun/nlu-asdiv-dataset"
241
- },
242
- "babi": {
243
- "name": "bAbI",
244
- "description": "https://arxiv.org/abs/1502.05698",
245
- "paper": "Abstract: https://arxiv.org/abs/1502.05698",
246
- "homepage": "Homepage: https://github.com/facebookarchive/bAbI-tasks"
247
- },
248
- "basque_bench": {
249
- "name": "BasqueBench",
250
- "description": "BasqueBench is a benchmark for evaluating language models in Basque tasks. This is, it evaluates the ability of a language model to understand and generate Basque text. BasqueBench offers a combination of pre-existing, open datasets and datasets developed exclusivelly for this benchmark. All the details of BasqueBench will be published in a paper soon.",
251
- "paper": "",
252
- "homepage": ""
253
- },
254
- "basqueglue": {
255
- "name": "BasqueGLUE",
256
- "description": "`https://aclanthology.org/2022.lrec-1.172/`",
257
- "paper": "Abstract: `https://aclanthology.org/2022.lrec-1.172/`",
258
- "homepage": "Homepage: `https://github.com/orai-nlp/BasqueGLUE`"
259
- },
260
- "bbh": {
261
- "name": "BigBenchHard",
262
- "description": "https://arxiv.org/abs/2210.09261",
263
- "paper": "Abstract: https://arxiv.org/abs/2210.09261",
264
- "homepage": "Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard"
265
- },
266
- "bbq": {
267
- "name": "BBQ",
268
- "description": "https://aclanthology.org/2022.findings-acl.165/",
269
- "paper": "Abstract: https://aclanthology.org/2022.findings-acl.165/",
270
- "homepage": "Homepage: https://github.com/nyu-mll/BBQ"
271
- },
272
- "belebele": {
273
- "name": "Belebele",
274
- "description": "The Belebele Benchmark for Massively Multilingual NLU Evaluation\nhttps://arxiv.org/abs/2308.16884",
275
- "paper": "",
276
- "homepage": "Homepage: https://github.com/facebookresearch/belebele"
277
- },
278
- "benchmarks": {
279
- "name": "benchmarks",
280
- "description": "### Changelog\n- 2025-Mar-17 OpenLLM v2: Fixed few-shot split to correctly use train set for arc_challenge.",
281
- "paper": "",
282
- "homepage": ""
283
- },
284
- "benchmarks_multimedqa": {
285
- "name": "MultiMedQA (multiple-choice subset)",
286
- "description": "https://arxiv.org/abs/2212.13138",
287
- "paper": "Abstract: https://arxiv.org/abs/2212.13138",
288
- "homepage": ""
289
- },
290
- "bertaqa": {
291
- "name": "BertaQA",
292
- "description": "https://arxiv.org/abs/2406.07302",
293
- "paper": "Abstract: https://arxiv.org/abs/2406.07302",
294
- "homepage": "Homepage: https://github.com/juletx/BertaQA"
295
- },
296
- "bigbench": {
297
- "name": "BigBench",
298
- "description": "https://arxiv.org/abs/2206.04615",
299
- "paper": "Abstract: https://arxiv.org/abs/2206.04615",
300
- "homepage": "Homepage: https://github.com/google/BIG-bench"
301
- },
302
- "blimp": {
303
- "name": "Task-name",
304
- "description": "`https://arxiv.org/abs/1912.00582`",
305
- "paper": "Abstract: `https://arxiv.org/abs/1912.00582`",
306
- "homepage": "Homepage: https://github.com/alexwarstadt/blimp"
307
- },
308
- "c4": {
309
- "name": "Colossal Clean Crawled Corpus(C4)",
310
- "description": "[Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683)",
311
- "paper": "",
312
- "homepage": "Homepage](https://huggingface.co/datasets/allenai/c4"
313
- },
314
- "careqa": {
315
- "name": "CareQA",
316
- "description": "[https://arxiv.org/abs/2502.06666](https://arxiv.org/abs/2502.06666)",
317
- "paper": "Abstract: [https://arxiv.org/abs/2502.06666](https://arxiv.org/abs/2502.06666",
318
- "homepage": ""
319
- },
320
- "catalan_bench": {
321
- "name": "CatalanBench",
322
- "description": "ive and extreme). - `phrases_va`: Two Phrases_va tasks for language adaptation between Catalan and Valencian.",
323
- "paper": "",
324
- "homepage": ""
325
- },
326
- "ceval": {
327
- "name": "C-Eval (Validation)",
328
- "description": "### Paper\nC-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models\nhttps://arxiv.org/pdf/2305.08322.pdf",
329
- "paper": "",
330
- "homepage": "Homepage: https://cevalbenchmark.com/"
331
- },
332
- "chartqa": {
333
- "name": "Task-name",
334
- "description": "`In this work, we present a large-scale benchmark covering 9.6K human-written questions as well as 23.1K questions generated from human-written chart summaries.`",
335
- "paper": "",
336
- "homepage": "Homepage: `https://github.com/vis-nlp/ChartQA`"
337
- },
338
- "cmmlu": {
339
- "name": "CMMLU",
340
- "description": "CMMLU: Measuring massive multitask language understanding in Chinese\nhttps://arxiv.org/abs/2306.09212",
341
- "paper": "",
342
- "homepage": "Homepage: https://github.com/haonan-li/CMMLU"
343
- },
344
- "code_x_glue": {
345
- "name": "CodeXGLUE Code-to-Text",
346
- "description": "- **Dataset:** CodeXGLUE (code-to-text tasks)\n- **Source:** https://github.com/microsoft/CodeXGLUE",
347
- "paper": "",
348
- "homepage": ""
349
- },
350
- "commonsense_qa": {
351
- "name": "Task-name",
352
- "description": "https://arxiv.org/pdf/1811.00937.pdf",
353
- "paper": "Abstract: https://arxiv.org/pdf/1811.00937.pdf",
354
- "homepage": "Homepage: https://www.tau-nlp.org/commonsenseqa"
355
- },
356
- "copal_id": {
357
- "name": "COPAL",
358
- "description": "`https://arxiv.org/abs/2311.01012`",
359
- "paper": "Abstract: `https://arxiv.org/abs/2311.01012`",
360
- "homepage": "Homepage: `https://github.com/haryoa/copal-id`"
361
- },
362
- "coqa": {
363
- "name": "CoQA",
364
- "description": "https://arxiv.org/pdf/1808.07042.pdf",
365
- "paper": "Abstract: https://arxiv.org/pdf/1808.07042.pdf",
366
- "homepage": "Homepage: https://stanfordnlp.github.io/coqa/"
367
- },
368
- "crows_pairs": {
369
- "name": "CrowS-Pairs",
370
- "description": "= \"Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 exam",
371
- "paper": "",
372
- "homepage": "Homepage: https://github.com/nyu-mll/crows-pairs,"
373
- },
374
- "csatqa": {
375
- "name": "CSATQA",
376
- "description": "- **Dataset:** HAERAE-HUB/csatqa\n- **Source:** https://huggingface.co/datasets/HAERAE-HUB/csatqa",
377
- "paper": "",
378
- "homepage": ""
379
- },
380
- "darija_bench": {
381
- "name": "DarijaBench",
382
- "description": "[https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912)",
383
- "paper": "Abstract: [https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912",
384
- "homepage": "Homepage: [https://huggingface.co/datasets/MBZUAI-Paris/DarijaBench](https://huggingface.co/datasets/MBZUAI-Paris/DarijaBench"
385
- },
386
- "darija_bench_darija_sentiment": {
387
- "name": "DarijaBench: Sentiment Analysis",
388
- "description": "[https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912)",
389
- "paper": "Abstract: [https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912",
390
- "homepage": "Homepage: [https://huggingface.co/datasets/MBZUAI-Paris/DarijaBench](https://huggingface.co/datasets/MBZUAI-Paris/DarijaBench"
391
- },
392
- "darija_bench_darija_summarization": {
393
- "name": "DarijaBench: Summarization",
394
- "description": "[https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912)",
395
- "paper": "Abstract: [https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912",
396
- "homepage": "Homepage: [https://huggingface.co/datasets/MBZUAI-Paris/DarijaBench](https://huggingface.co/datasets/MBZUAI-Paris/DarijaBench"
397
- },
398
- "darija_bench_darija_translation": {
399
- "name": "DarijaBench: Translation",
400
- "description": "[https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912)",
401
- "paper": "Abstract: [https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912",
402
- "homepage": "Homepage: [https://huggingface.co/datasets/MBZUAI-Paris/DarijaBench](https://huggingface.co/datasets/MBZUAI-Paris/DarijaBench"
403
- },
404
- "darija_bench_darija_transliteration": {
405
- "name": "DarijaBench: Transliteration",
406
- "description": "[https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912)",
407
- "paper": "Abstract: [https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912",
408
- "homepage": "Homepage: [https://huggingface.co/datasets/MBZUAI-Paris/DarijaBench](https://huggingface.co/datasets/MBZUAI-Paris/DarijaBench"
409
- },
410
- "darijahellaswag": {
411
- "name": "DarijaHellaSwag",
412
- "description": "[https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912)",
413
- "paper": "Abstract: [https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912",
414
- "homepage": "Homepage: [https://huggingface.co/datasets/MBZUAI-Paris/DarijaHellaSwag](https://huggingface.co/datasets/MBZUAI-Paris/DarijaHellaSwag"
415
- },
416
- "darijammlu": {
417
- "name": "DarijaMMLU",
418
- "description": "[https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912)",
419
- "paper": "Abstract: [https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912",
420
- "homepage": "Homepage: [https://huggingface.co/datasets/MBZUAI-Paris/DarijaMMLU](https://huggingface.co/datasets/MBZUAI-Paris/DarijaMMLU"
421
- },
422
- "drop": {
423
- "name": "DROP",
424
- "description": "https://aclanthology.org/attachments/N19-1246.Supplementary.pdf",
425
- "paper": "Abstract: https://aclanthology.org/attachments/N19-1246.Supplementary.pdf",
426
- "homepage": "Homepage: https://allenai.org/data/drop"
427
- },
428
- "egyhellaswag": {
429
- "name": "EgyHellaSwag",
430
- "description": "[https://arxiv.org/abs/2505.18383](https://arxiv.org/abs/2505.18383)",
431
- "paper": "Abstract: [https://arxiv.org/abs/2505.18383](https://arxiv.org/abs/2505.18383",
432
- "homepage": "Homepage: [https://huggingface.co/datasets/UBC-NLP/EgyHellaSwag](https://huggingface.co/datasets/UBC-NLP/EgyHellaSwag"
433
- },
434
- "egymmlu": {
435
- "name": "EgyMMLU",
436
- "description": "[https://arxiv.org/abs/2505.18383](https://arxiv.org/abs/2505.18383)",
437
- "paper": "Abstract: [https://arxiv.org/abs/2505.18383](https://arxiv.org/abs/2505.18383",
438
- "homepage": "Homepage: [https://huggingface.co/datasets/UBC-NLP/EgyMMLU](https://huggingface.co/datasets/UBC-NLP/EgyMMLU"
439
- },
440
- "eq_bench": {
441
- "name": "EQ-Bench",
442
- "description": "https://arxiv.org/abs/2312.06281",
443
- "paper": "Abstract: https://arxiv.org/abs/2312.06281",
444
- "homepage": "Homepage: https://eqbench.com/"
445
- },
446
- "eus_exams": {
447
- "name": "EusExams",
448
- "description": "https://arxiv.org/abs/2403.20266",
449
- "paper": "Abstract: https://arxiv.org/abs/2403.20266",
450
- "homepage": "Homepage: https://github.com/hitz-zentroa/latxa"
451
- },
452
- "eus_proficiency": {
453
- "name": "EusProficiency",
454
- "description": "https://arxiv.org/abs/2403.20266",
455
- "paper": "Abstract: https://arxiv.org/abs/2403.20266",
456
- "homepage": "Homepage: https://github.com/hitz-zentroa/latxa"
457
- },
458
- "eus_reading": {
459
- "name": "EusReading",
460
- "description": "https://arxiv.org/abs/2403.20266",
461
- "paper": "Abstract: https://arxiv.org/abs/2403.20266",
462
- "homepage": "Homepage: https://github.com/hitz-zentroa/latxa"
463
- },
464
- "eus_trivia": {
465
- "name": "EusTrivia",
466
- "description": "https://arxiv.org/abs/2403.20266",
467
- "paper": "Abstract: https://arxiv.org/abs/2403.20266",
468
- "homepage": "Homepage: https://github.com/hitz-zentroa/latxa"
469
- },
470
- "evalita_llm": {
471
- "name": "Evalita-LLM",
472
- "description": "Evalita-LLM, a new benchmark designed to evaluate Large Language\nModels (LLMs) on Italian tasks. The distinguishing and innovative features of\nEvalita-LLM are the following: (i) all tasks are native Italian, avoiding issues of\ntranslating from Italian and potential cultural biases; (ii) in addition to well established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitiga",
473
- "paper": "",
474
- "homepage": ""
475
- },
476
- "fda": {
477
- "name": "FDA",
478
- "description": "A long standing goal of the data management community is to develop general, automated systems that ingest semi-structured documents and output queryable tables without human effort or domain specific customization. Given the sheer variety of potential documents, state-of-the art systems make simplifying assumptions and use domain specific training. In this work, we ask whether we can maintain generality by using large language models (LLMs). LLMs, which are pretrained on broad data, can perform",
479
- "paper": "",
480
- "homepage": "Homepage: https://github.com/HazyResearch/based-evaluation-harness"
481
- },
482
- "fld": {
483
- "name": "FLD",
484
- "description": "https://arxiv.org/abs/2308.07336",
485
- "paper": "Abstract: https://arxiv.org/abs/2308.07336",
486
- "homepage": "Homepage: https://github.com/hitachi-nlp/FLD"
487
- },
488
- "french_bench": {
489
- "name": "FrenchBench",
490
- "description": "- french_bench_orangesum_title - french_bench_trivia - french_bench_hellaswag - french_bench_arc_challenge",
491
- "paper": "",
492
- "homepage": ""
493
- },
494
- "galician_bench": {
495
- "name": "GalicianBench",
496
- "description": "GalicianBench is a benchmark for evaluating language models in Galician tasks. This is, it evaluates the ability of a language model to understand and generate Galician text. GalicianBench offers a combination of pre-existing, open datasets and datasets developed exclusivelly for this benchmark. All the details of GalicianBench will be published in a paper soon.",
497
- "paper": "",
498
- "homepage": ""
499
- },
500
- "glianorex": {
501
- "name": "Glianorex",
502
- "description": "https://arxiv.org/abs/2406.02394",
503
- "paper": "Abstract: https://arxiv.org/abs/2406.02394",
504
- "homepage": ""
505
- },
506
- "global_mmlu": {
507
- "name": "Global-MMLU",
508
- "description": "[https://arxiv.org/abs/2412.03304](https://arxiv.org/abs/2412.03304)",
509
- "paper": "Abstract: [https://arxiv.org/abs/2412.03304](https://arxiv.org/abs/2412.03304",
510
- "homepage": ""
511
- },
512
- "glue": {
513
- "name": "GLUE",
514
- "description": "https://openreview.net/pdf?id=rJ4km2R5t7",
515
- "paper": "Abstract: https://openreview.net/pdf?id=rJ4km2R5t7",
516
- "homepage": "Homepage: https://gluebenchmark.com/"
517
- },
518
- "gpqa": {
519
- "name": "GPQA",
520
- "description": "https://arxiv.org/abs/2311.12022",
521
- "paper": "Abstract: https://arxiv.org/abs/2311.12022",
522
- "homepage": "Homepage: `https://github.com/idavidrein/gpqa/tree/main`"
523
- },
524
- "groundcocoa": {
525
- "name": "GroundCocoa",
526
- "description": "https://arxiv.org/abs/2404.04237",
527
- "paper": "Abstract: https://arxiv.org/abs/2404.04237",
528
- "homepage": "Homepage: `https://osu-nlp-group.github.io/GroundCocoa/`"
529
- },
530
- "gsm8k": {
531
- "name": "GSM8k",
532
- "description": "## Paper\nTraining Verifiers to Solve Math Word Problems\nhttps://arxiv.org/abs/2110.14168",
533
- "paper": "",
534
- "homepage": "Homepage: https://github.com/openai/grade-school-math"
535
- },
536
- "gsm8k_platinum": {
537
- "name": "GSM8k Platinum",
538
- "description": "GSM8K Platinum is a revised version of the full test set of GSM8K (Grade School Math 8K), a dataset of grade school math word problems. To revise this dataset, we ran a variety of frontier models each individual example and manually re-annotated any example for which at least one model made an error. We revise the labels of mislabeled examples, and remove any question that we determine to be poorly written (most often due to ambiguity in the problem statement). See our paper for further details ",
539
- "paper": "",
540
- "homepage": "Homepage: http://platinum-bench.csail.mit.edu/"
541
- },
542
- "gsm_plus": {
543
- "name": "gsm_plus",
544
- "description": "`Large language models (LLMs) have achieved impressive performance across various mathematical reasoning benchmarks. However, there are increasing debates regarding whether these models truly understand and apply mathematical knowledge or merely rely on shortcuts for mathematical reasoning. One essential and frequently occurring evidence is that when the math questions are slightly changed, LLMs can behave incorrectly. This motivates us to evaluate the robustness of LLMs\u2019 math reasoning capabili",
545
- "paper": "",
546
- "homepage": "Homepage: https://huggingface.co/datasets/qintongli/GSM-Plus"
547
- },
548
- "haerae": {
549
- "name": "HAE-RAE BENCH",
550
- "description": "`Large Language Models (LLMs) trained on massive corpora demonstrate impressive capabilities in a wide range of tasks. While there are ongoing efforts to adapt these models to languages beyond English, the attention given to their evaluation methodologies remains limited. Current multilingual benchmarks often rely on back translations or re-implementations of English tests, limiting their capacity to capture unique cultural and linguistic nuances. To bridge this gap for the Korean language, we i",
551
- "paper": "",
552
- "homepage": "Homepage: https://huggingface.co/datasets/HAERAE-HUB/HAE_RAE_BENCH"
553
- },
554
- "headqa": {
555
- "name": "HEAD-QA",
556
- "description": "= \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performan",
557
- "paper": "",
558
- "homepage": "Homepage: https://aghie.github.io/head-qa/"
559
- },
560
- "hellaswag": {
561
- "name": "HellaSwag",
562
- "description": "https://arxiv.org/abs/1905.07830",
563
- "paper": "Abstract: https://arxiv.org/abs/1905.07830",
564
- "homepage": "Homepage: `https://rowanzellers.com/hellaswag/`"
565
- },
566
- "hendrycks_ethics": {
567
- "name": "ETHICS Dataset",
568
- "description": "Pointer Sentinel Mixture Models\nhttps://arxiv.org/pdf/1609.07843.pdf",
569
- "paper": "",
570
- "homepage": "Homepage: https://github.com/hendrycks/ethics"
571
- },
572
- "hendrycks_math": {
573
- "name": "MATH",
574
- "description": "## Paper\nMeasuring Mathematical Problem Solving With the MATH Dataset\nhttps://arxiv.org/abs/2103.03874",
575
- "paper": "paper (https://arxiv.org/abs/2206.14858",
576
- "homepage": "Homepage: https://github.com/hendrycks/math"
577
- },
578
- "histoires_morales": {
579
- "name": "Histoires Morales",
580
- "description": "`https://arxiv.org/pdf/2501.17117`",
581
- "paper": "Abstract: `https://arxiv.org/pdf/2501.17117`",
582
- "homepage": "Homepage: `https://huggingface.co/datasets/LabHC/histoires_morales`"
583
- },
584
- "hrm8k": {
585
- "name": "HRM8K",
586
- "description": "Title: [Understand, Solve and Translate: Bridging the Multilingual Mathematical Reasoning Gap](https://www.arxiv.org/abs/2501.02448)",
587
- "paper": "",
588
- "homepage": "Homepage: https://huggingface.co/datasets/HAERAE-HUB/HRM8K"
589
- },
590
- "humaneval": {
591
- "name": "HumanEval",
592
- "description": "## Paper\nEvaluating Large Language Models Trained on Code\nhttps://arxiv.org/abs/2107.03374",
593
- "paper": "",
594
- "homepage": "Homepage: https://github.com/openai/human-eval"
595
- },
596
- "ifeval": {
597
- "name": "IFEval",
598
- "description": "https://arxiv.org/abs/2311.07911",
599
- "paper": "Abstract: https://arxiv.org/abs/2311.07911",
600
- "homepage": "Homepage: https://github.com/google-research/google-research/tree/master/instruction_following_eval"
601
- },
602
- "include": {
603
- "name": "INCLUDE",
604
- "description": "[https://arxiv.org/abs/2411.19799](https://arxiv.org/abs/2411.19799)",
605
- "paper": "Abstract: [https://arxiv.org/abs/2411.19799](https://arxiv.org/abs/2411.19799",
606
- "homepage": ""
607
- },
608
- "inverse_scaling": {
609
- "name": "inverse_scaling",
610
- "description": "`Work on scaling laws has found that large language models (LMs) show predictable improvements to overall loss with increased scale (model size, training data, and compute). Here, we present evidence for the claim that LMs may show inverse scaling, or worse task performance with increased scale, e.g., due to flaws in the training objective and data. We present empirical evidence of inverse scaling on 11 datasets collected by running a public contest, the Inverse Scaling Prize, with a substantial",
611
- "paper": "paper (but have been tested for equivalence to the OPT numbers reported at https://huggingface.co/inverse-scaling/opt-1.3b_eval",
612
- "homepage": "Homepage: https://github.com/inverse-scaling/prize"
613
- },
614
- "japanese_leaderboard": {
615
- "name": "Japanese Leaderboard",
616
- "description": "ive Summarization for 44 Languages}, author = { Hasan, Tahmid and Bhattacharjee, Abhik and Islam, Md. Saiful and Mubasshir, Kazi and Li, Yuan-Fang and Kang, Yong-Bin and Rahman, M. Sohel and Shahriyar, Rifat }, year = 2021, month = aug, booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021}, publisher = {Association for Computational Linguistics}, address = {Online}, pages = {4693--4703}, url = {https://aclanthology.org/2021.findings-acl.413} }",
617
- "paper": "paper Language models are multilingual chain-of-thought reasoners. [Source](https://huggingface.co/datasets/juletxara/mgsm",
618
- "homepage": ""
619
- },
620
- "jsonschema_bench": {
621
- "name": "JSONSchema Bench",
622
- "description": "- `jsonschema_bench_easy`, corresponding to the `github_easy` split of the original paper\n- `jsonschema_bench_medium`, corresponding to the `github_medium` split of the original paper\n- `jsonschema_bench_hard`, corresponding to the `github_hard` split of the original paper",
623
- "paper": "",
624
- "homepage": "Homepage: https://github.com/guidance-ai/jsonschemabench"
625
- },
626
- "kbl": {
627
- "name": "kbl",
628
- "description": "`Large language models (LLMs) have demonstrated remarkable performance in the legal domain, with GPT-4 even passing the Uniform Bar Exam in the U.S. However their efficacy remains limited for non-standardized tasks and tasks in languages other than English. This underscores the need for careful evaluation of LLMs within each legal system before application. Here, we introduce KBL, a benchmark for assessing the Korean legal language understanding of LLMs, consisting of (1) 7 legal knowledge tasks",
629
- "paper": "",
630
- "homepage": "Homepage: `https://github.com/lbox-kr/kbl`"
631
- },
632
- "kmmlu": {
633
- "name": "k_mmlu",
634
- "description": "`We propose KMMLU, a new Korean benchmark with 35,030 expert-level multiple-choice questions across 45 subjects ranging from humanities to STEM. Unlike previous Korean benchmarks that are translated from existing English benchmarks, KMMLU is collected from original Korean exams, capturing linguistic and cultural aspects of the Korean language. We test 26 publicly available and proprietary LLMs, identifying significant room for improvement. The best publicly available model achieves 50.54% on KMM",
635
- "paper": "",
636
- "homepage": "Homepage: https://huggingface.co/datasets/HAERAE-HUB/KMMLU"
637
- },
638
- "kobest": {
639
- "name": "LAMBADA",
640
- "description": "https://arxiv.org/abs/2204.04541",
641
- "paper": "Abstract: https://arxiv.org/abs/2204.04541",
642
- "homepage": "Homepage: https://huggingface.co/datasets/skt/kobest_v1"
643
- },
644
- "kormedmcqa": {
645
- "name": "KorMedMCQA",
646
- "description": "`We introduce KorMedMCQA, the first Korean multiple-choice question answering (MCQA) benchmark derived from Korean healthcare professional licensing examinations, covering from the year 2012 to year 2023. This dataset consists of a selection of questions from the license examinations for doctors, nurses, and pharmacists, featuring a diverse array of subjects. We conduct baseline experiments on various large language models, including proprietary/open-source, multilingual/Korean-additional pretra",
647
- "paper": "Paper : https://arxiv.org/abs/2403.01469",
648
- "homepage": "Homepage: https://huggingface.co/datasets/sean0042/KorMedMCQA"
649
- },
650
- "lambada": {
651
- "name": "LAMBADA",
652
- "description": "https://arxiv.org/pdf/1606.06031.pdf",
653
- "paper": "Abstract: https://arxiv.org/pdf/1606.06031.pdf",
654
- "homepage": "Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI"
655
- },
656
- "lambada_cloze": {
657
- "name": "LAMBADA Cloze",
658
- "description": "https://arxiv.org/abs/1606.06031",
659
- "paper": "Abstract: https://arxiv.org/abs/1606.06031",
660
- "homepage": "Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI"
661
- },
662
- "lambada_multilingual": {
663
- "name": "LAMBADA",
664
- "description": "### Paper\nThe LAMBADA dataset: Word prediction requiring a broad discourse context\nhttps://arxiv.org/pdf/1606.06031.pdf",
665
- "paper": "",
666
- "homepage": "Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI"
667
- },
668
- "lambada_multilingual_stablelm": {
669
- "name": "LAMBADA",
670
- "description": "### Paper\nThe LAMBADA dataset: Word prediction requiring a broad discourse context\nhttps://arxiv.org/pdf/1606.06031.pdf",
671
- "paper": "",
672
- "homepage": "Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI"
673
- },
674
- "leaderboard": {
675
- "name": "Leaderboard evaluations",
676
- "description": "# Leaderboard evaluations\nOur goal with this group is to create an unchanging through time version of\nevaluations that will power the Open LLM Leaderboard on HuggingFace.",
677
- "paper": "paper: https://huggingface.co/papers/2210.09261",
678
- "homepage": "Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard"
679
- },
680
- "libra": {
681
- "name": "Task-name",
682
- "description": "`Datasets for proper evaluation of long-context understanding in Russian. For the Russian language LIBRA comprises 21 adapted datasets to study the LLM's abilities to understand long texts thoroughly. The tests are divided into four complexity groups and allow the evaluation of models across various context lengths ranging from 4k up to 128k tokens.`",
683
- "paper": "",
684
- "homepage": "Homepage: `https://huggingface.co/datasets/ai-forever/LIBRA`"
685
- },
686
- "lingoly": {
687
- "name": "LingOly",
688
- "description": "`https://arxiv.org/abs/2406.06196`",
689
- "paper": "Abstract: `https://arxiv.org/abs/2406.06196`",
690
- "homepage": "Homepage: `https://github.com/am-bean/lingOly`"
691
- },
692
- "llama3": {
693
- "name": "Task-name",
694
- "description": "Evals reproducing those provided by the LLAMA team in the Hugging Face repo.",
695
- "paper": "",
696
- "homepage": "Homepage: `https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f`"
697
- },
698
- "logiqa": {
699
- "name": "LogiQA",
700
- "description": "https://arxiv.org/abs/2007.08124",
701
- "paper": "Abstract: https://arxiv.org/abs/2007.08124",
702
- "homepage": "Homepage: https://github.com/lgw863/LogiQA-dataset"
703
- },
704
- "logiqa2": {
705
- "name": "LogiQA 2.0",
706
- "description": "LogiQA 2.0 \u2014 An Improved Dataset for Logical Reasoning in Natural Language Understanding https://ieeexplore.ieee.org/document/10174688",
707
- "paper": "paper does not. There is another implementation of this task, but it designed for instruction tuned models: https://github.com/csitfun/LogiEval",
708
- "homepage": "Homepage: https://github.com/csitfun/LogiQA2.0"
709
- },
710
- "longbench": {
711
- "name": "LongBench",
712
- "description": "`In this paper, we introduce LongBench, the first bilingual, multi-task benchmark for long context understanding, enabling a more rigorous evaluation of long context understanding. LongBench comprises 21 datasets across 6 task categories in both English and Chinese, with an average length of 6,711 words (English) and 13,386 characters (Chinese). These tasks cover key long-text application areas including single-doc QA, multi-doc QA, summarization, few-shot learning, synthetic tasks, and code com",
713
- "paper": "",
714
- "homepage": "Homepage: `https://github.com/THUDM/LongBench`"
715
- },
716
- "mastermind": {
717
- "name": "MastermindEval",
718
- "description": "https://arxiv.org/abs/2503.05891",
719
- "paper": "Abstract: https://arxiv.org/abs/2503.05891",
720
- "homepage": ""
721
- },
722
- "mathqa": {
723
- "name": "MathQA",
724
- "description": "MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms\nhttps://arxiv.org/pdf/1905.13319.pdf",
725
- "paper": "",
726
- "homepage": "Homepage: https://math-qa.github.io/math-QA/"
727
- },
728
- "mbpp": {
729
- "name": "MBPP",
730
- "description": "## Paper\nProgram Synthesis with Large Language Models\nhttps://arxiv.org/abs/2108.07732",
731
- "paper": "",
732
- "homepage": "Homepage: https://github.com/google-research/google-research/tree/master/mbpp"
733
- },
734
- "mc_taco": {
735
- "name": "MC Taco",
736
- "description": "https://arxiv.org/abs/1909.03065",
737
- "paper": "Abstract: https://arxiv.org/abs/1909.03065",
738
- "homepage": "Homepage: https://leaderboard.allenai.org/mctaco/submissions/public"
739
- },
740
- "med_concepts_qa": {
741
- "name": "MedConceptsQA",
742
- "description": "https://arxiv.org/abs/2405.07348",
743
- "paper": "Abstract: https://arxiv.org/abs/2405.07348",
744
- "homepage": ""
745
- },
746
- "meddialog": {
747
- "name": "Meddialog",
748
- "description": "[https://aclanthology.org/2020.emnlp-main.743/](https://aclanthology.org/2020.emnlp-main.743/)",
749
- "paper": "Abstract: [https://aclanthology.org/2020.emnlp-main.743/](https://aclanthology.org/2020.emnlp-main.743/",
750
- "homepage": ""
751
- },
752
- "mediqa_qa2019": {
753
- "name": "MEDIQA_QA 2019",
754
- "description": "[https://aclanthology.org/W19-5039/](https://aclanthology.org/W19-5039/)",
755
- "paper": "Abstract: [https://aclanthology.org/W19-5039/](https://aclanthology.org/W19-5039/",
756
- "homepage": ""
757
- },
758
- "medmcqa": {
759
- "name": "MedMCQA",
760
- "description": "- **Dataset:** medmcqa\n- **Source:** https://huggingface.co/datasets/medmcqa",
761
- "paper": "",
762
- "homepage": ""
763
- },
764
- "medqa": {
765
- "name": "MedQA",
766
- "description": "- **Dataset:** GBaker/MedQA-USMLE-4-options-hf\n- **Source:** https://huggingface.co/datasets/GBaker/MedQA-USMLE-4-options-hf",
767
- "paper": "",
768
- "homepage": ""
769
- },
770
- "medtext": {
771
- "name": "MedText",
772
- "description": "[https://arxiv.org/abs/1905.07002](https://arxiv.org/abs/1905.07002)",
773
- "paper": "Abstract: [https://arxiv.org/abs/1905.07002](https://arxiv.org/abs/1905.07002",
774
- "homepage": ""
775
- },
776
- "mela": {
777
- "name": "Task-name",
778
- "description": "**: In this work, we present the largest benchmark to date on linguistic acceptability: Multilingual Evaluation of Linguistic Acceptability -- MELA, with 46K samples covering 10 languages from a diverse set of language families. We establish LLM baselines on this benchmark, and investigate cross-lingual transfer in acceptability judgements with XLM-R. In pursuit of multilingual interpretability, we conduct probing experiments with fine-tuned XLM-R to explore the process of syntax capability acqu",
779
- "paper": "",
780
- "homepage": "Homepage: https://github.com/sjtu-compling/MELA"
781
- },
782
- "meqsum": {
783
- "name": "MeqSum",
784
- "description": "[https://aclanthology.org/P19-1215/](https://aclanthology.org/P19-1215/)",
785
- "paper": "Abstract: [https://aclanthology.org/P19-1215/](https://aclanthology.org/P19-1215/",
786
- "homepage": ""
787
- },
788
- "metabench": {
789
- "name": "Metabench",
790
- "description": "https://arxiv.org/abs/2407.12844",
791
- "paper": "Abstract: https://arxiv.org/abs/2407.12844",
792
- "homepage": "Homepage: https://github.com/adkipnis/metabench"
793
- },
794
- "mgsm": {
795
- "name": "MGSM",
796
- "description": "https://arxiv.org/abs/2210.03057",
797
- "paper": "Abstract: https://arxiv.org/abs/2210.03057",
798
- "homepage": "Homepage: https://github.com/google-research/url-nlp/tree/main/mgsm"
799
- },
800
- "mimic_repsum": {
801
- "name": "MIMIC-III Report Summarization",
802
- "description": "[https://www.nature.com/articles/sdata201635](https://www.nature.com/articles/sdata201635)",
803
- "paper": "Abstract: [https://www.nature.com/articles/sdata201635](https://www.nature.com/articles/sdata201635",
804
- "homepage": ""
805
- },
806
- "minerva_math": {
807
- "name": "MATH",
808
- "description": "# MATH\n\u2139\ufe0f This is the 4-shot variant!\n## Paper\nMeasuring Mathematical Problem Solving With the MATH Dataset\nhttps://arxiv.org/abs/2103.03874",
809
- "paper": "paper is one where the model is first fine-tuned on the data. They do have a few-shot evaluation for GPT-3, however the few-shot context used here is sourced from [Lewkowycz et al](https://arxiv.org/abs/2206.14858",
810
- "homepage": "Homepage: https://github.com/hendrycks/math"
811
- },
812
- "mlqa": {
813
- "name": "MLQA",
814
- "description": "`https://arxiv.org/abs/1910.07475`",
815
- "paper": "Abstract: `https://arxiv.org/abs/1910.07475`",
816
- "homepage": "Homepage: `https://github.com/facebookresearch/MLQA`"
817
- },
818
- "mmlu-pro-plus": {
819
- "name": "mmlu_pro_plus",
820
- "description": "`Existing benchmarks for large language models (LLMs) increasingly struggle to differentiate between top-performing models, underscoring the need for more challenging evaluation frameworks. We introduce MMLU-Pro+, an enhanced benchmark building upon MMLU-Pro to assess shortcut learning and higher-order reasoning in LLMs. By incorporating questions with multiple correct answers across diverse domains, MMLU-Pro+ tests LLMs' ability to engage in complex reasoning and resist simplistic problem-solvi",
821
- "paper": "",
822
- "homepage": "Homepage: https://github.com/asgsaeid/mmlu-pro-plus"
823
- },
824
- "mmlu": {
825
- "name": "Task-name",
826
- "description": "`https://arxiv.org/abs/2009.03300`",
827
- "paper": "Abstract: `https://arxiv.org/abs/2009.03300`",
828
- "homepage": "Homepage: `https://github.com/hendrycks/test`"
829
- },
830
- "mmlu_pro": {
831
- "name": "mmlu_pro",
832
- "description": "`In the age of large-scale language models, benchmarks like the Massive Multitask Language Understanding (MMLU) have been pivotal in pushing the boundaries of what AI can achieve in language comprehension and reasoning across diverse domains. However, as models continue to improve, their performance on these benchmarks has begun to plateau, making it increasingly difficult to discern differences in model capabilities. This paper introduces MMLU-Pro, an enhanced dataset designed to extend the mos",
833
- "paper": "",
834
- "homepage": "Homepage: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro"
835
- },
836
- "mmlu_prox": {
837
- "name": "MMLU-ProX",
838
- "description": "`Traditional benchmarks like MMLU and MMLU-Pro focus primarily on single-language evaluation, limiting their ability to assess language models in multilingual and culturally diverse contexts. To address this gap, we introduce MMLU-ProX, a comprehensive multilingual benchmark that builds upon MMLU-Pro by covering multiple typologically diverse languages with approximately 11,829 questions per language.`",
839
- "paper": "",
840
- "homepage": "Homepage: https://mmluprox.github.io/"
841
- },
842
- "mmlusr": {
843
- "name": "MMLU-SR",
844
- "description": "_algebra`",
845
- "paper": "paper is one where the model is first fine-tuned on the data. They do have a few-shot evaluation for GPT-3, however the few-shot context used here is sourced from [Lewkowycz et al](https://arxiv.org/abs/2206.14858",
846
- "homepage": "Homepage: [https://github.com/Wang-ML-Lab/MMLU-SR](https://github.com/Wang-ML-Lab/MMLU-SR"
847
- },
848
- "mmmu": {
849
- "name": "MMMU Benchmark",
850
- "description": "`MMMU is a new benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning.`",
851
- "paper": "",
852
- "homepage": "Homepage: `https://github.com/MMMU-Benchmark/MMMU/tree/main/mmmu`"
853
- },
854
- "model_written_evals": {
855
- "name": "Model Written Evals",
856
- "description": "Model Written Evals are evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. These tasks test language models on various safety-related dimensions including AI risk awareness, persona biases, and sycophancy.",
857
- "paper": "",
858
- "homepage": ""
859
- },
860
- "moral_stories": {
861
- "name": "Moral Stories",
862
- "description": "`https://aclanthology.org/2021.emnlp-main.54/`",
863
- "paper": "Abstract: `https://aclanthology.org/2021.emnlp-main.54/`",
864
- "homepage": "Homepage: `https://github.com/demelin/moral_stories`"
865
- },
866
- "mts_dialog": {
867
- "name": "MTS-Dialog",
868
- "description": "[https://aclanthology.org/2023.eacl-main.168/](https://aclanthology.org/2023.eacl-main.168/)",
869
- "paper": "Abstract: [https://aclanthology.org/2023.eacl-main.168/](https://aclanthology.org/2023.eacl-main.168/",
870
- "homepage": ""
871
- },
872
- "multiblimp": {
873
- "name": "MultiBLiMP: A Massively Multilingual Benchmark of Linguistic Minimal Pairs",
874
- "description": "# MultiBLiMP: A Massively Multilingual Benchmark of Linguistic Minimal Pairs",
875
- "paper": "Paper: https://arxiv.org/abs/2504.02768",
876
- "homepage": ""
877
- },
878
- "mutual": {
879
- "name": "MuTual",
880
- "description": "https://www.aclweb.org/anthology/2020.acl-main.130/",
881
- "paper": "Abstract: https://www.aclweb.org/anthology/2020.acl-main.130/",
882
- "homepage": "Homepage: https://github.com/Nealcly/MuTual"
883
- },
884
- "noreval": {
885
- "name": "\ud83c\uddf3\ud83c\uddf4 NorEval",
886
- "description": "[arxiv.org/abs/2504.07749](https://arxiv.org/abs/2504.07749) * Homepage: [github.com/ltgoslo/noreval](https://github.com/ltgoslo/noreval/tree/main)",
887
- "paper": "Abstract: [arxiv.org/abs/2504.07749](https://arxiv.org/abs/2504.07749",
888
- "homepage": "Homepage: [github.com/ltgoslo/noreval](https://github.com/ltgoslo/noreval/tree/main"
889
- },
890
- "noreval_ask_gec": {
891
- "name": "noreval_ask_gec",
892
- "description": "Here, we use the `--predict_only` argument and compute the performance metrics as described below.",
893
- "paper": "",
894
- "homepage": ""
895
- },
896
- "noticia": {
897
- "name": "NoticIA",
898
- "description": "https://arxiv.org/abs/2404.07611",
899
- "paper": "Abstract: https://arxiv.org/abs/2404.07611",
900
- "homepage": "Homepage: https://github.com/ikergarcia1996/NoticIA"
901
- },
902
- "nq_open": {
903
- "name": "nq_open",
904
- "description": "Question Answering dataset based on aggregated user queries from Google Search.",
905
- "paper": "Paper: [aclanthology.org/P19-1612](https://aclanthology.org/P19-1612/",
906
- "homepage": "Homepage: https://research.google/pubs/natural-questions-a-benchmark-for-question-answering-research/"
907
- },
908
- "okapi_arc_multilingual": {
909
- "name": "Multilingual ARC",
910
- "description": "https://arxiv.org/abs/2307.16039",
911
- "paper": "Abstract: https://arxiv.org/abs/2307.16039",
912
- "homepage": "Homepage: `https://github.com/nlp-uoregon/Okapi`"
913
- },
914
- "okapi_hellaswag_multilingual": {
915
- "name": "Multilingual HellaSwag",
916
- "description": "https://arxiv.org/abs/2307.16039",
917
- "paper": "Abstract: https://arxiv.org/abs/2307.16039",
918
- "homepage": "Homepage: `https://github.com/nlp-uoregon/Okapi`"
919
- },
920
- "okapi_mmlu_multilingual": {
921
- "name": "OKAPI MMLU Multilingual",
922
- "description": "- **Dataset:** alexandrainst/m_mmlu\n- **Source:** https://huggingface.co/datasets/alexandrainst/m_mmlu",
923
- "paper": "",
924
- "homepage": ""
925
- },
926
- "okapi_truthfulqa_multilingual": {
927
- "name": "Multilingual TruthfulQA",
928
- "description": "https://arxiv.org/abs/2307.16039",
929
- "paper": "Abstract: https://arxiv.org/abs/2307.16039",
930
- "homepage": "Homepage: `https://github.com/nlp-uoregon/Okapi`"
931
- },
932
- "olaph": {
933
- "name": "OLAPH",
934
- "description": "[https://arxiv.org/abs/2405.12701](https://arxiv.org/abs/2405.12701)",
935
- "paper": "Abstract: [https://arxiv.org/abs/2405.12701](https://arxiv.org/abs/2405.12701",
936
- "homepage": ""
937
- },
938
- "openbookqa": {
939
- "name": "OpenBookQA",
940
- "description": "https://arxiv.org/abs/1809.02789",
941
- "paper": "Abstract: https://arxiv.org/abs/1809.02789",
942
- "homepage": "Homepage: https://allenai.org/data/open-book-qa"
943
- },
944
- "paloma": {
945
- "name": "Paloma",
946
- "description": "https://arxiv.org/abs/2312.10523v1",
947
- "paper": "Abstract: https://arxiv.org/abs/2312.10523v1",
948
- "homepage": "Homepage: https://allenai.org/olmo"
949
- },
950
- "paws-x": {
951
- "name": "PAWS-X",
952
- "description": "https://arxiv.org/abs/1908.11828",
953
- "paper": "Abstract: https://arxiv.org/abs/1908.11828",
954
- "homepage": "Homepage: https://github.com/google-research-datasets/paws/tree/master/pawsx"
955
- },
956
- "pile": {
957
- "name": "The Pile",
958
- "description": "https://arxiv.org/abs/2101.00027",
959
- "paper": "Abstract: https://arxiv.org/abs/2101.00027",
960
- "homepage": "Homepage: https://pile.eleuther.ai/"
961
- },
962
- "pile_10k": {
963
- "name": "Pile-10k",
964
- "description": "The first 10K elements of [The Pile](https://pile.eleuther.ai/), useful for debugging models trained on it. See the [HuggingFace page for the full Pile](https://huggingface.co/datasets/the_pile) for more info. Inspired by [stas' great resource](https://huggingface.co/datasets/stas/openwebtext-10k) doing the same for OpenWebText",
965
- "paper": "Abstract: The first 10K elements of [The Pile](https://pile.eleuther.ai/",
966
- "homepage": "Homepage: [https://huggingface.co/datasets/NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k"
967
- },
968
- "piqa": {
969
- "name": "PIQA",
970
- "description": "https://arxiv.org/abs/1911.11641",
971
- "paper": "Abstract: https://arxiv.org/abs/1911.11641",
972
- "homepage": "Homepage: https://yonatanbisk.com/piqa/"
973
- },
974
- "polemo2": {
975
- "name": "PolEmo 2.0",
976
- "description": "https://aclanthology.org/K19-1092/",
977
- "paper": "Abstract: https://aclanthology.org/K19-1092/",
978
- "homepage": "Homepage: https://clarin-pl.eu/dspace/handle/11321/710"
979
- },
980
- "portuguese_bench": {
981
- "name": "PortugueseBench",
982
- "description": "PortugueseBench is a benchmark for evaluating language models in Portuguese tasks. This is, it evaluates the ability of a language model to understand and generate Portuguese text. PortugueseBench offers a combination of pre-existing, open datasets. All the details of PortugueseBench will be published in a paper soon.",
983
- "paper": "",
984
- "homepage": ""
985
- },
986
- "prost": {
987
- "name": "PROST",
988
- "description": "https://arxiv.org/abs/2106.03634",
989
- "paper": "Abstract: https://arxiv.org/abs/2106.03634",
990
- "homepage": "Homepage: https://github.com/nala-cub/prost"
991
- },
992
- "pubmedqa": {
993
- "name": "PubMedQA",
994
- "description": "https://arxiv.org/abs/1909.06146",
995
- "paper": "Abstract: https://arxiv.org/abs/1909.06146",
996
- "homepage": "Homepage: https://pubmedqa.github.io/"
997
- },
998
- "qa4mre": {
999
- "name": "QA4MRE",
1000
- "description": "https://www.cs.cmu.edu/~./hovy/papers/13CLEF-QA4MRE.pdf",
1001
- "paper": "Abstract: https://www.cs.cmu.edu/~./hovy/papers/13CLEF-QA4MRE.pdf",
1002
- "homepage": "Homepage: http://nlp.uned.es/clef-qa/repository/qa4mre.php"
1003
- },
1004
- "qasper": {
1005
- "name": "QASPER",
1006
- "description": "https://arxiv.org/abs/2105.03011",
1007
- "paper": "Abstract: https://arxiv.org/abs/2105.03011",
1008
- "homepage": "Homepage: https://allenai.org/data/qasper"
1009
- },
1010
- "race": {
1011
- "name": "RACE",
1012
- "description": "https://arxiv.org/abs/1704.04683",
1013
- "paper": "Abstract: https://arxiv.org/abs/1704.04683",
1014
- "homepage": "Homepage: https://www.cs.cmu.edu/~glai1/data/race/"
1015
- },
1016
- "realtoxicityprompts": {
1017
- "name": "RealToxicityPrompts",
1018
- "description": "- **Dataset:** allenai/real-toxicity-prompts\n- **Source:** https://huggingface.co/datasets/allenai/real-toxicity-prompts",
1019
- "paper": "",
1020
- "homepage": ""
1021
- },
1022
- "ruler": {
1023
- "name": "Task-name",
1024
- "description": "`https://arxiv.org/abs/2404.06654`",
1025
- "paper": "Abstract: `https://arxiv.org/abs/2404.06654`",
1026
- "homepage": "Homepage: `https://github.com/NVIDIA/RULER`"
1027
- },
1028
- "sciq": {
1029
- "name": "SciQ",
1030
- "description": "https://aclanthology.org/W17-4413.pdf",
1031
- "paper": "Abstract: https://aclanthology.org/W17-4413.pdf",
1032
- "homepage": "Homepage: https://allenai.org/data/sciq"
1033
- },
1034
- "score": {
1035
- "name": "SCORE: Systematic COnsistency and Robustness Evaluation for Large Language Models",
1036
- "description": "```\nCopyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.",
1037
- "paper": "",
1038
- "homepage": ""
1039
- },
1040
- "scrolls": {
1041
- "name": "scrolls",
1042
- "description": "\"\"\"\nSCROLLS: Standardized CompaRison Over Long Language Sequences\nhttps://arxiv.org/abs/2201.03533",
1043
- "paper": "",
1044
- "homepage": "Homepage: https://www.scrolls-benchmark.com/"
1045
- },
1046
- "simple_cooccurrence_bias": {
1047
- "name": "Simple Cooccurrence Bias",
1048
- "description": "https://arxiv.org/pdf/2005.14165.pdf",
1049
- "paper": "Abstract: https://arxiv.org/pdf/2005.14165.pdf",
1050
- "homepage": ""
1051
- },
1052
- "siqa": {
1053
- "name": "Social IQA",
1054
- "description": "https://arxiv.org/abs/1904.09728",
1055
- "paper": "Abstract: https://arxiv.org/abs/1904.09728",
1056
- "homepage": "Homepage: https://allenai.org/data/socialiqa"
1057
- },
1058
- "spanish_bench": {
1059
- "name": "SpanishBench",
1060
- "description": "ive Summarization for 44 Languages](https://aclanthology.org/2021.findings-acl.413/) | https://huggingface.co/datasets/csebuetnlp/xlsum | | XNLI_es | Natural Language Inference | [XNLI: Evaluating Cross-lingual Sentence Representations](https://aclanthology.org/D18-1269/) | https://huggingface.co/datasets/facebook/xnli | | XQuAD_es | Question Answering | [On the Cross-lingual Transferability of Monolingual Representations](https://aclanthology.org/2020.acl-main.421/) | https://huggingface.co/dat",
1061
- "paper": "paper04.pdf) | https://huggingface.co/datasets/gplsi/cocoteros",
1062
- "homepage": ""
1063
- },
1064
- "squad_completion": {
1065
- "name": "Squad-completion",
1066
- "description": "Title: Simple Linear Attention Language Models Balance The Recall-Throughput Tradeoff",
1067
- "paper": "",
1068
- "homepage": "Homepage: https://github.com/HazyResearch/based-evaluation-harness"
1069
- },
1070
- "squadv2": {
1071
- "name": "Task-name",
1072
- "description": "https://arxiv.org/abs/1806.03822",
1073
- "paper": "Abstract: https://arxiv.org/abs/1806.03822",
1074
- "homepage": "Homepage: https://rajpurkar.github.io/SQuAD-explorer/"
1075
- },
1076
- "storycloze": {
1077
- "name": "StoryCloze",
1078
- "description": "`https://arxiv.org/abs/1604.01696`",
1079
- "paper": "Abstract: `https://arxiv.org/abs/1604.01696`",
1080
- "homepage": "Homepage: https://cs.rochester.edu/nlp/rocstories/"
1081
- },
1082
- "super_glue": {
1083
- "name": "SuperGLUE",
1084
- "description": "`https://w4ngatang.github.io/static/papers/superglue.pdf`",
1085
- "paper": "Abstract: `https://w4ngatang.github.io/static/papers/superglue.pdf`",
1086
- "homepage": "Homepage: https://super.gluebenchmark.com/"
1087
- },
1088
- "swag": {
1089
- "name": "SWAG",
1090
- "description": "https://arxiv.org/pdf/1808.05326.pdf",
1091
- "paper": "Abstract: https://arxiv.org/pdf/1808.05326.pdf",
1092
- "homepage": "Homepage: https://rowanzellers.com/swag/"
1093
- },
1094
- "swde": {
1095
- "name": "SWDE",
1096
- "description": "A long standing goal of the data management community is to develop general, automated systems that ingest semi-structured documents and output queryable tables without human effort or domain specific customization. Given the sheer variety of potential documents, state-of-the art systems make simplifying assumptions and use domain specific training. In this work, we ask whether we can maintain generality by using large language models (LLMs). LLMs, which are pretrained on broad data, can perform",
1097
- "paper": "",
1098
- "homepage": "Homepage: https://github.com/HazyResearch/based-evaluation-harness"
1099
- },
1100
- "tinyBenchmarks": {
1101
- "name": "tinyBenchmarks",
1102
- "description": "https://arxiv.org/abs/2402.14992",
1103
- "paper": "Abstract: https://arxiv.org/abs/2402.14992",
1104
- "homepage": ""
1105
- },
1106
- "tmlu": {
1107
- "name": "TMLU",
1108
- "description": "`The evaluation of large language models (LLMs) has drawn substantial attention in the field recently. This work focuses on evaluating LLMs in a Chinese context, specifically, for Traditional Chinese which has been largely underrepresented in existing benchmarks. We present TMLU, a holistic evaluation suit tailored for assessing the advanced knowledge and reasoning capability in LLMs, under the context of Taiwanese Mandarin. TMLU consists of an array of 37 subjects across social science, STEM, h",
1109
- "paper": "",
1110
- "homepage": "Homepage: [TMLU Huggingface Dataset](https://huggingface.co/datasets/miulab/tmlu"
1111
- },
1112
- "tmmluplus": {
1113
- "name": "TMMLU+",
1114
- "description": "`We present TMMLU+, a comprehensive dataset designed for the Traditional Chinese massive multitask language understanding dataset. TMMLU+ is a multiple-choice question-answering dataset with 66 subjects from elementary to professional level. Compared to its predecessor, TMMLU, TMMLU+ is six times larger and boasts a more balanced subject distribution. We included benchmark results in TMMLU+ from closed-source models and 24 open-weight Chinese large language models of parameters ranging from 1.8B",
1115
- "paper": "",
1116
- "homepage": "Homepage: [https://huggingface.co/datasets/ikala/tmmluplus](https://huggingface.co/datasets/ikala/tmmluplus"
1117
- },
1118
- "toxigen": {
1119
- "name": "ToxiGen",
1120
- "description": "https://arxiv.org/abs/2203.09509",
1121
- "paper": "Abstract: https://arxiv.org/abs/2203.09509",
1122
- "homepage": "Homepage: https://github.com/microsoft/TOXIGEN"
1123
- },
1124
- "translation": {
1125
- "name": "Translation Tasks",
1126
- "description": "* `gpt3_translation_tasks`\n* `wmt14`\n* `wmt16`\n* `wmt20`\n* `iwslt2017`",
1127
- "paper": "",
1128
- "homepage": ""
1129
- },
1130
- "triviaqa": {
1131
- "name": "Trivia QA",
1132
- "description": "https://arxiv.org/abs/1705.03551",
1133
- "paper": "Abstract: https://arxiv.org/abs/1705.03551",
1134
- "homepage": "Homepage: https://nlp.cs.washington.edu/triviaqa/"
1135
- },
1136
- "truthfulqa-multi": {
1137
- "name": "TruthfulQA-Multi",
1138
- "description": "`[https://arxiv.org/abs/2502.09387v1](https://arxiv.org/abs/2502.09387v1)`",
1139
- "paper": "Abstract: `[https://arxiv.org/abs/2502.09387v1](https://arxiv.org/abs/2502.09387v1",
1140
- "homepage": ""
1141
- },
1142
- "truthfulqa": {
1143
- "name": "TruthfulQA",
1144
- "description": "`https://arxiv.org/abs/2109.07958`",
1145
- "paper": "Abstract: `https://arxiv.org/abs/2109.07958`",
1146
- "homepage": "Homepage: `https://github.com/sylinrl/TruthfulQA`"
1147
- },
1148
- "turkishmmlu": {
1149
- "name": "TurkishMMLU",
1150
- "description": "Multiple choice question answering tasks evaluate the reasoning, comprehension, and mathematical abilities of Large Language Models (LLMs). While existing benchmarks employ automatic translation for multilingual evaluation, this approach is error-prone and potentially introduces culturally biased questions, especially in social sciences. We introduce the first multitask, multiple-choice Turkish QA benchmark, TurkishMMLU, to evaluate LLMs' understanding of the Turkish language. TurkishMMLU includ",
1151
- "paper": "",
1152
- "homepage": ""
1153
- },
1154
- "unitxt": {
1155
- "name": "Unitxt",
1156
- "description": "[link](https://arxiv.org/abs/2401.14019)",
1157
- "paper": "Abstract: [link](https://arxiv.org/abs/2401.14019",
1158
- "homepage": ""
1159
- },
1160
- "unscramble": {
1161
- "name": "Unscramble",
1162
- "description": "Language Models are Few-Shot Learners\nhttps://arxiv.org/pdf/2005.14165.pdf",
1163
- "paper": "",
1164
- "homepage": "Homepage: https://github.com/openai/gpt-3/tree/master/data"
1165
- },
1166
- "webqs": {
1167
- "name": "WEBQs",
1168
- "description": "`https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf`",
1169
- "paper": "Abstract: `https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf`",
1170
- "homepage": "Homepage: `https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a`"
1171
- },
1172
- "wikitext": {
1173
- "name": "Wikitext",
1174
- "description": "Pointer Sentinel Mixture Models\nhttps://arxiv.org/pdf/1609.07843.pdf",
1175
- "paper": "",
1176
- "homepage": "Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/"
1177
- },
1178
- "winogender": {
1179
- "name": "Winogender",
1180
- "description": "https://aclanthology.org/N18-2002.pdf",
1181
- "paper": "Abstract: https://aclanthology.org/N18-2002.pdf",
1182
- "homepage": "Homepage: https://github.com/rudinger/winogender-schemas"
1183
- },
1184
- "winogrande": {
1185
- "name": "WinoGrande",
1186
- "description": "https://arxiv.org/abs/1907.10641",
1187
- "paper": "Abstract: https://arxiv.org/abs/1907.10641",
1188
- "homepage": "Homepage: https://leaderboard.allenai.org/winogrande/submissions/public"
1189
- },
1190
- "wmdp": {
1191
- "name": "WMDP",
1192
- "description": "`https://arxiv.org/abs/2403.03218`",
1193
- "paper": "Abstract: `https://arxiv.org/abs/2403.03218`",
1194
- "homepage": "Homepage: https://wmdp.ai"
1195
- },
1196
- "wmt2016": {
1197
- "name": "WMT16",
1198
- "description": "http://www.aclweb.org/anthology/W/W16/W16-2301",
1199
- "paper": "Abstract: http://www.aclweb.org/anthology/W/W16/W16-2301",
1200
- "homepage": "Homepage: https://huggingface.co/datasets/wmt16"
1201
- },
1202
- "wsc273": {
1203
- "name": "WSC273",
1204
- "description": "http://commonsensereasoning.org/2011/papers/Levesque.pdf",
1205
- "paper": "Abstract: http://commonsensereasoning.org/2011/papers/Levesque.pdf",
1206
- "homepage": "Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html"
1207
- },
1208
- "xcopa": {
1209
- "name": "XCOPA",
1210
- "description": "https://ducdauge.github.io/files/xcopa.pdf",
1211
- "paper": "Abstract: https://ducdauge.github.io/files/xcopa.pdf",
1212
- "homepage": "Homepage: https://github.com/cambridgeltl/xcopa"
1213
- },
1214
- "xnli": {
1215
- "name": "XNLI",
1216
- "description": "https://arxiv.org/abs/1809.05053",
1217
- "paper": "Abstract: https://arxiv.org/abs/1809.05053",
1218
- "homepage": "Homepage: https://github.com/facebookresearch/XNLI"
1219
- },
1220
- "xnli_eu": {
1221
- "name": "XNLIeu",
1222
- "description": "https://arxiv.org/abs/2404.06996",
1223
- "paper": "Abstract: https://arxiv.org/abs/2404.06996",
1224
- "homepage": "Homepage: https://github.com/hitz-zentroa/xnli-eu"
1225
- },
1226
- "xquad": {
1227
- "name": "XQuAD",
1228
- "description": "https://aclanthology.org/2020.acl-main.421.pdf",
1229
- "paper": "Abstract: https://aclanthology.org/2020.acl-main.421.pdf",
1230
- "homepage": "Homepage: https://github.com/deepmind/xquad"
1231
- },
1232
- "xstorycloze": {
1233
- "name": "XStoryCloze",
1234
- "description": "https://arxiv.org/abs/2112.10668",
1235
- "paper": "Abstract: https://arxiv.org/abs/2112.10668",
1236
- "homepage": "Homepage: https://github.com/facebookresearch/fairseq/pull/4820"
1237
- },
1238
- "xwinograd": {
1239
- "name": "Task-name",
1240
- "description": "`https://arxiv.org/abs/2106.12066`",
1241
- "paper": "Abstract: `https://arxiv.org/abs/2106.12066`",
1242
- "homepage": "Homepage: `https://huggingface.co/datasets/Muennighoff/xwinograd`"
1243
- }
1244
- }