wisent 0.7.379__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (1720)
  1. wisent/__init__.py +64 -0
  2. wisent/cli.py +114 -0
  3. wisent/core/__init__.py +40 -0
  4. wisent/core/activations/__init__.py +26 -0
  5. wisent/core/activations/activations.py +97 -0
  6. wisent/core/activations/activations_collector.py +506 -0
  7. wisent/core/activations/core/__init__.py +0 -0
  8. wisent/core/activations/core/atoms.py +219 -0
  9. wisent/core/activations/prompt_construction_strategy.py +47 -0
  10. wisent/core/adapters/__init__.py +22 -0
  11. wisent/core/adapters/audio.py +616 -0
  12. wisent/core/adapters/base.py +420 -0
  13. wisent/core/adapters/multimodal.py +738 -0
  14. wisent/core/adapters/robotics.py +643 -0
  15. wisent/core/adapters/text.py +441 -0
  16. wisent/core/adapters/video.py +555 -0
  17. wisent/core/agent/__init__.py +1 -0
  18. wisent/core/agent/budget.py +644 -0
  19. wisent/core/agent/device_benchmarks.py +691 -0
  20. wisent/core/agent/diagnose/__init__.py +1 -0
  21. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  22. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  23. wisent/core/agent/diagnose/create_classifier.py +1155 -0
  24. wisent/core/agent/diagnose/response_diagnostics.py +273 -0
  25. wisent/core/agent/diagnose/select_classifiers.py +507 -0
  26. wisent/core/agent/diagnose/synthetic_classifier_option.py +755 -0
  27. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  28. wisent/core/agent/diagnose/tasks/task_manager.py +1453 -0
  29. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  30. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  31. wisent/core/agent/diagnose.py +249 -0
  32. wisent/core/agent/steer.py +215 -0
  33. wisent/core/agent/timeout.py +134 -0
  34. wisent/core/autonomous_agent.py +1158 -0
  35. wisent/core/benchmark_extractors.py +372 -0
  36. wisent/core/benchmark_registry.py +151 -0
  37. wisent/core/bigcode_extractors.py +26 -0
  38. wisent/core/bigcode_integration.py +886 -0
  39. wisent/core/branding.py +108 -0
  40. wisent/core/classifier/__init__.py +1 -0
  41. wisent/core/classifier/models/__init__.py +1 -0
  42. wisent/core/classifiers/__init__.py +1 -0
  43. wisent/core/classifiers/classifiers/__init__.py +0 -0
  44. wisent/core/classifiers/classifiers/core/__init__.py +0 -0
  45. wisent/core/classifiers/classifiers/core/atoms.py +748 -0
  46. wisent/core/classifiers/classifiers/models/__init__.py +0 -0
  47. wisent/core/classifiers/classifiers/models/logistic.py +29 -0
  48. wisent/core/classifiers/classifiers/models/mlp.py +47 -0
  49. wisent/core/classifiers/classifiers/rotator.py +137 -0
  50. wisent/core/classifiers/core/__init__.py +1 -0
  51. wisent/core/classifiers/models/__init__.py +1 -0
  52. wisent/core/classifiers/pipeline_steps/__init__.py +1 -0
  53. wisent/core/cli/__init__.py +26 -0
  54. wisent/core/cli/agent/__init__.py +15 -0
  55. wisent/core/cli/agent/apply_steering.py +192 -0
  56. wisent/core/cli/agent/evaluate_response.py +128 -0
  57. wisent/core/cli/agent/generate_synthetic_pairs.py +123 -0
  58. wisent/core/cli/agent/main.py +139 -0
  59. wisent/core/cli/agent/train_classifier.py +173 -0
  60. wisent/core/cli/check_linearity.py +126 -0
  61. wisent/core/cli/create_steering_vector.py +304 -0
  62. wisent/core/cli/diagnose_pairs.py +153 -0
  63. wisent/core/cli/diagnose_vectors.py +404 -0
  64. wisent/core/cli/estimate_unified_goodness_time.py +428 -0
  65. wisent/core/cli/evaluate_refusal.py +241 -0
  66. wisent/core/cli/evaluate_responses.py +926 -0
  67. wisent/core/cli/generate_humanization_pairs.py +128 -0
  68. wisent/core/cli/generate_pairs.py +175 -0
  69. wisent/core/cli/generate_pairs_from_task.py +108 -0
  70. wisent/core/cli/generate_responses.py +160 -0
  71. wisent/core/cli/generate_vector_from_synthetic.py +217 -0
  72. wisent/core/cli/generate_vector_from_task.py +248 -0
  73. wisent/core/cli/get_activations.py +192 -0
  74. wisent/core/cli/inference_config.py +84 -0
  75. wisent/core/cli/inference_config_cli.py +54 -0
  76. wisent/core/cli/modify_weights.py +660 -0
  77. wisent/core/cli/multi_steer.py +112 -0
  78. wisent/core/cli/optimization_cache.py +298 -0
  79. wisent/core/cli/optimize.py +621 -0
  80. wisent/core/cli/optimize_classification.py +473 -0
  81. wisent/core/cli/optimize_sample_size.py +390 -0
  82. wisent/core/cli/optimize_steering.py +3421 -0
  83. wisent/core/cli/optimize_weights.py +1287 -0
  84. wisent/core/cli/steering_method_trainer.py +641 -0
  85. wisent/core/cli/steering_search_space.py +508 -0
  86. wisent/core/cli/tasks.py +940 -0
  87. wisent/core/cli/train_unified_goodness.py +681 -0
  88. wisent/core/cli_logger.py +22 -0
  89. wisent/core/config_manager.py +1731 -0
  90. wisent/core/contrastive_pairs/__init__.py +15 -0
  91. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  92. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  93. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  94. wisent/core/contrastive_pairs/core/pair.py +183 -0
  95. wisent/core/contrastive_pairs/core/response.py +153 -0
  96. wisent/core/contrastive_pairs/core/serialization.py +306 -0
  97. wisent/core/contrastive_pairs/core/set.py +192 -0
  98. wisent/core/contrastive_pairs/diagnostics/__init__.py +79 -0
  99. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  100. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  101. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1655 -0
  102. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  103. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  104. wisent/core/contrastive_pairs/diagnostics/duplicates.py +118 -0
  105. wisent/core/contrastive_pairs/diagnostics/linearity.py +325 -0
  106. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +620 -0
  107. wisent/core/contrastive_pairs/huggingface_pairs/__init__.py +1 -0
  108. wisent/core/contrastive_pairs/huggingface_pairs/atoms.py +255 -0
  109. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +470 -0
  110. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_registry.py +136 -0
  111. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +44 -0
  112. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentbench.py +225 -0
  113. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentharm.py +267 -0
  114. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +444 -0
  115. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +225 -0
  116. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime.py +118 -0
  117. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2024.py +74 -0
  118. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2025.py +73 -0
  119. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/alpaca_eval.py +153 -0
  120. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +182 -0
  121. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/arena_hard.py +179 -0
  122. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/atis.py +89 -0
  123. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/babilong.py +96 -0
  124. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bangla_mmlu.py +108 -0
  125. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/basqueglue.py +217 -0
  126. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bec2016eu.py +99 -0
  127. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bfcl.py +283 -0
  128. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bhtc_v2.py +87 -0
  129. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +245 -0
  130. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chain_of_thought.py +89 -0
  131. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chinese_simpleqa.py +209 -0
  132. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cluewsc.py +177 -0
  133. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cnn_dailymail.py +92 -0
  134. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +378 -0
  135. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +109 -0
  136. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +15 -0
  137. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +64 -0
  138. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +65 -0
  139. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +65 -0
  140. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +65 -0
  141. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +65 -0
  142. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +65 -0
  143. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +844 -0
  144. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coedit_gec.py +79 -0
  145. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/conala.py +133 -0
  146. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/concode.py +111 -0
  147. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/dbpedia_14.py +91 -0
  148. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/doc_vqa.py +102 -0
  149. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/donotanswer.py +236 -0
  150. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds1000.py +129 -0
  151. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds_1000.py +155 -0
  152. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/epec_koref_bin.py +85 -0
  153. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ethos_binary.py +82 -0
  154. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_mp.py +165 -0
  155. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_sp_sum_task_fp_small_p1.py +89 -0
  156. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/facts_grounding.py +181 -0
  157. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +295 -0
  158. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/financial_tweets.py +100 -0
  159. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +270 -0
  160. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flan_held_in.py +98 -0
  161. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +572 -0
  162. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +143 -0
  163. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +99 -0
  164. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_negative_example_livecodebench.py +146 -0
  165. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_positive_example_livecodebench.py +140 -0
  166. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/gpt3_translation_benchmarks.py +98 -0
  167. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +389 -0
  168. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/halueval.py +246 -0
  169. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/harmbench.py +250 -0
  170. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/healthbench.py +181 -0
  171. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hle.py +106 -0
  172. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hmmt.py +117 -0
  173. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +119 -0
  174. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humanevalpack.py +102 -0
  175. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +180 -0
  176. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +129 -0
  177. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_ar_en.py +98 -0
  178. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_en_ar.py +98 -0
  179. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/jailbreakbench.py +258 -0
  180. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/law_stack_exchange.py +101 -0
  181. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ledgar.py +118 -0
  182. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench.py +61 -0
  183. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_contrastive_pair_generator.py +491 -0
  184. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_v6.py +263 -0
  185. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +230 -0
  186. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/llama.py +96 -0
  187. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +285 -0
  188. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/m_mmlu.py +96 -0
  189. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math.py +186 -0
  190. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +146 -0
  191. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +142 -0
  192. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/meddialog.py +79 -0
  193. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medical_abstracts.py +101 -0
  194. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +787 -0
  195. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +111 -0
  196. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlu_redux.py +194 -0
  197. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlusr.py +108 -0
  198. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multimedqa.py +99 -0
  199. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multipl_e.py +109 -0
  200. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple.py +96 -0
  201. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_choice.py +87 -0
  202. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_cpp.py +128 -0
  203. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_go.py +128 -0
  204. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_java.py +128 -0
  205. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_js.py +128 -0
  206. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_py.py +15 -0
  207. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_rs.py +128 -0
  208. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/non_greedy_robustness_agieval_aqua_rat.py +92 -0
  209. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +287 -0
  210. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/openllm.py +99 -0
  211. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/option_order_robustness_agieval_aqua_rat.py +92 -0
  212. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/or_bench.py +300 -0
  213. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/penn_treebank.py +80 -0
  214. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +317 -0
  215. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +467 -0
  216. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/prompt_robustness_agieval_aqua_rat.py +92 -0
  217. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/pythia.py +99 -0
  218. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +131 -0
  219. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +280 -0
  220. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/scicode.py +275 -0
  221. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/self_consistency.py +90 -0
  222. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +145 -0
  223. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sorry_bench.py +211 -0
  224. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/stsb.py +79 -0
  225. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1.py +99 -0
  226. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1_seq2seq.py +98 -0
  227. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_t5_prompt.py +123 -0
  228. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_gpqa.py +106 -0
  229. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench.py +428 -0
  230. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench_verified.py +158 -0
  231. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sycophancy_eval.py +205 -0
  232. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/t0_eval.py +79 -0
  233. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tag.py +98 -0
  234. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +305 -0
  235. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tmlu.py +109 -0
  236. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +360 -0
  237. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +386 -0
  238. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/travelplanner.py +286 -0
  239. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/truthfulqa_generation.py +128 -0
  240. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/unfair_tos.py +83 -0
  241. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/vaxx_stance.py +86 -0
  242. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wiceu.py +85 -0
  243. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wikitext103.py +97 -0
  244. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wildguard.py +280 -0
  245. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_en_fr.py +97 -0
  246. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_fr_en.py +97 -0
  247. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_de_en.py +90 -0
  248. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_de.py +90 -0
  249. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_ro.py +90 -0
  250. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_ro_en.py +90 -0
  251. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt_ro_en_t5_prompt.py +90 -0
  252. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/xsum.py +81 -0
  253. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  254. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +265 -0
  255. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/__init__.py +472 -0
  256. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aclue.py +24 -0
  257. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acp.py +33 -0
  258. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acpbench.py +39 -0
  259. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/advanced_ai_risk.py +59 -0
  260. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aexams.py +14 -0
  261. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimgsm.py +10 -0
  262. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimmlu.py +10 -0
  263. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrixnli.py +9 -0
  264. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench.py +14 -0
  265. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_adr.py +9 -0
  266. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afriqa.py +9 -0
  267. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afrisenti.py +9 -0
  268. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_belebele.py +9 -0
  269. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_flores.py +9 -0
  270. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_injongointent.py +9 -0
  271. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_mafand.py +9 -0
  272. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhaner.py +9 -0
  273. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhanews.py +9 -0
  274. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhapos.py +9 -0
  275. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_naijarc.py +9 -0
  276. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_nollysenti.py +9 -0
  277. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_ntrex.py +9 -0
  278. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_openai_mmlu.py +9 -0
  279. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_salt.py +9 -0
  280. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_sib.py +9 -0
  281. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_uhura_arc_easy.py +9 -0
  282. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_xlsum.py +9 -0
  283. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/agieval.py +33 -0
  284. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/anli.py +9 -0
  285. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arab_culture.py +24 -0
  286. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva.py +67 -0
  287. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva_light.py +67 -0
  288. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_complete.py +24 -0
  289. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_light.py +81 -0
  290. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabicmmlu.py +59 -0
  291. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aradice.py +36 -0
  292. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arc.py +61 -0
  293. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arithmetic.py +19 -0
  294. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/basque_bench.py +37 -0
  295. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbh.py +121 -0
  296. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbq.py +9 -0
  297. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/belebele.py +293 -0
  298. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bertaqa.py +25 -0
  299. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bigbench.py +300 -0
  300. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/blimp.py +76 -0
  301. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/careqa.py +9 -0
  302. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/catalan_bench.py +43 -0
  303. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ceval_valid.py +61 -0
  304. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/cmmlu.py +76 -0
  305. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +16 -0
  306. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/copal_id.py +11 -0
  307. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/crows_pairs.py +31 -0
  308. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/csatqa.py +15 -0
  309. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darija.py +29 -0
  310. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darijammlu.py +57 -0
  311. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/egymmlu.py +62 -0
  312. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/eus.py +76 -0
  313. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/evalita_mp.py +93 -0
  314. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/fld.py +9 -0
  315. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/flores.py +466 -0
  316. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +9 -0
  317. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/french_bench.py +23 -0
  318. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/galician_bench.py +41 -0
  319. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/glianorex.py +11 -0
  320. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/global_mmlu.py +115 -0
  321. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gpqa.py +27 -0
  322. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k.py +9 -0
  323. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k_platinum.py +9 -0
  324. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/haerae.py +14 -0
  325. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/headqa.py +11 -0
  326. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hellaswag.py +39 -0
  327. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_ethics.py +14 -0
  328. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_math.py +9 -0
  329. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hrm8k.py +20 -0
  330. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/inverse.py +22 -0
  331. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/japanese_leaderboard.py +20 -0
  332. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/jsonschema_bench.py +9 -0
  333. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kbl.py +85 -0
  334. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kmmlu.py +281 -0
  335. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kobest.py +14 -0
  336. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kormedmcqa.py +9 -0
  337. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lambada.py +28 -0
  338. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/leaderboard.py +52 -0
  339. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/libra.py +9 -0
  340. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lingoly.py +11 -0
  341. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/longbench.py +9 -0
  342. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/m.py +43 -0
  343. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mastermind.py +9 -0
  344. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mathqa.py +9 -0
  345. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/med.py +24 -0
  346. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/meddialog.py +12 -0
  347. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/medqa.py +9 -0
  348. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mela.py +18 -0
  349. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/metabench.py +36 -0
  350. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mgsm.py +44 -0
  351. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/minerva_math.py +16 -0
  352. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mlqa.py +58 -0
  353. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu.py +70 -0
  354. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro.py +23 -0
  355. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro_plus.py +23 -0
  356. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_prox.py +191 -0
  357. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlusr.py +9 -0
  358. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmmu.py +46 -0
  359. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/model_written_evals.py +9 -0
  360. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/multiblimp.py +111 -0
  361. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/non.py +23 -0
  362. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noreval.py +143 -0
  363. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noridiom.py +20 -0
  364. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nortruthfulqa.py +32 -0
  365. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nrk.py +20 -0
  366. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi.py +9 -0
  367. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_arc_multilingual.py +10 -0
  368. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_hellaswag_multilingual.py +24 -0
  369. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_mmlu_multilingual.py +24 -0
  370. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_truthfulqa_multilingual.py +34 -0
  371. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/paloma.py +25 -0
  372. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pawsx.py +9 -0
  373. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/persona.py +144 -0
  374. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pile.py +31 -0
  375. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/polemo2.py +9 -0
  376. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/portuguese_bench.py +31 -0
  377. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/prompt.py +23 -0
  378. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qa4mre.py +12 -0
  379. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qasper.py +11 -0
  380. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ru.py +19 -0
  381. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ruler.py +9 -0
  382. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/score.py +20 -0
  383. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/scrolls.py +9 -0
  384. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/self_consistency.py +11 -0
  385. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/spanish_bench.py +38 -0
  386. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/storycloze.py +9 -0
  387. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/super_glue_t5_prompt.py +17 -0
  388. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tinyBenchmarks.py +9 -0
  389. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmlu.py +9 -0
  390. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmmluplus.py +80 -0
  391. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/translation.py +9 -0
  392. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa.py +76 -0
  393. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa_multi.py +24 -0
  394. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/turkishmmlu.py +30 -0
  395. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unitxt.py +23 -0
  396. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unscramble.py +9 -0
  397. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/winogender.py +16 -0
  398. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmdp.py +12 -0
  399. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt14.py +16 -0
  400. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt16.py +22 -0
  401. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wsc273.py +9 -0
  402. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xcopa.py +21 -0
  403. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli.py +28 -0
  404. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli_eu.py +12 -0
  405. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xquad.py +22 -0
  406. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xstorycloze.py +22 -0
  407. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xwinograd.py +15 -0
  408. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +478 -0
  409. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +140 -0
  410. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +125 -0
  411. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +171 -0
  412. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +207 -0
  413. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +185 -0
  414. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +130 -0
  415. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +184 -0
  416. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimgsm.py +98 -0
  417. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +113 -0
  418. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +129 -0
  419. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_cot.py +88 -0
  420. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_mc.py +107 -0
  421. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ag.py +134 -0
  422. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +155 -0
  423. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ai2_arc.py +114 -0
  424. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams1.py +81 -0
  425. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams2.py +81 -0
  426. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anli.py +140 -0
  427. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +180 -0
  428. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +98 -0
  429. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +104 -0
  430. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +168 -0
  431. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +168 -0
  432. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +167 -0
  433. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +268 -0
  434. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +133 -0
  435. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +118 -0
  436. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +118 -0
  437. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_gen.py +101 -0
  438. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_mc.py +106 -0
  439. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/argument.py +134 -0
  440. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +114 -0
  441. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +122 -0
  442. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/assin.py +103 -0
  443. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +113 -0
  444. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +155 -0
  445. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_gen.py +168 -0
  446. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_mc.py +139 -0
  447. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbh.py +133 -0
  448. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +169 -0
  449. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +181 -0
  450. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +155 -0
  451. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +165 -0
  452. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +155 -0
  453. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +143 -0
  454. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bigbench.py +170 -0
  455. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +171 -0
  456. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +152 -0
  457. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +117 -0
  458. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq_seq2seq.py +117 -0
  459. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +150 -0
  460. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +152 -0
  461. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabreu.py +127 -0
  462. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +169 -0
  463. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +155 -0
  464. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_gen.py +119 -0
  465. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_mc.py +113 -0
  466. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +171 -0
  467. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +139 -0
  468. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +117 -0
  469. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +223 -0
  470. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +163 -0
  471. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +110 -0
  472. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +238 -0
  473. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +151 -0
  474. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +152 -0
  475. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +166 -0
  476. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +144 -0
  477. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +148 -0
  478. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +161 -0
  479. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +114 -0
  480. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +107 -0
  481. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +149 -0
  482. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cola.py +83 -0
  483. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +107 -0
  484. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +127 -0
  485. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +124 -0
  486. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +169 -0
  487. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +162 -0
  488. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqcat.py +114 -0
  489. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/crows_pairs.py +158 -0
  490. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +152 -0
  491. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +107 -0
  492. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle_letters.py +81 -0
  493. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +221 -0
  494. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +174 -0
  495. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +152 -0
  496. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +157 -0
  497. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +152 -0
  498. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +107 -0
  499. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  500. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egyhellaswag.py +125 -0
  501. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egymmlu.py +180 -0
  502. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +142 -0
  503. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +107 -0
  504. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +194 -0
  505. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +152 -0
  506. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +152 -0
  507. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +152 -0
  508. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/escola.py +85 -0
  509. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +135 -0
  510. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethos.py +99 -0
  511. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +107 -0
  512. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +225 -0
  513. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +159 -0
  514. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +159 -0
  515. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +159 -0
  516. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +166 -0
  517. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_sp.py +109 -0
  518. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fda.py +105 -0
  519. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +107 -0
  520. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +114 -0
  521. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fld.py +143 -0
  522. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +202 -0
  523. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_mc.py +98 -0
  524. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_perplexity.py +86 -0
  525. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galcola.py +109 -0
  526. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +155 -0
  527. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_gen.py +118 -0
  528. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_mc.py +112 -0
  529. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +141 -0
  530. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +118 -0
  531. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +171 -0
  532. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +152 -0
  533. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glue.py +109 -0
  534. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpqa.py +161 -0
  535. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +110 -0
  536. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +184 -0
  537. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm.py +108 -0
  538. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +134 -0
  539. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +152 -0
  540. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  541. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +125 -0
  542. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +225 -0
  543. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +191 -0
  544. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +179 -0
  545. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hle.py +111 -0
  546. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +203 -0
  547. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval.py +124 -0
  548. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +152 -0
  549. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +152 -0
  550. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ifeval.py +118 -0
  551. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +107 -0
  552. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +192 -0
  553. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/iwslt2017.py +117 -0
  554. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +107 -0
  555. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +155 -0
  556. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_gen.py +224 -0
  557. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +120 -0
  558. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/jsonschema_bench.py +123 -0
  559. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kbl.py +140 -0
  560. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +168 -0
  561. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_cot.py +88 -0
  562. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_mc.py +107 -0
  563. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +165 -0
  564. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +160 -0
  565. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada.py +147 -0
  566. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +185 -0
  567. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +185 -0
  568. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual_stablelm.py +141 -0
  569. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +107 -0
  570. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +194 -0
  571. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/libra.py +165 -0
  572. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +203 -0
  573. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +155 -0
  574. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +152 -0
  575. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +152 -0
  576. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logieval.py +82 -0
  577. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  578. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  579. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +152 -0
  580. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +152 -0
  581. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +203 -0
  582. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mathqa.py +137 -0
  583. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +123 -0
  584. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +115 -0
  585. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +224 -0
  586. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +180 -0
  587. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +107 -0
  588. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mediqa_qa2019.py +123 -0
  589. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +169 -0
  590. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +118 -0
  591. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medtext.py +108 -0
  592. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +96 -0
  593. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meqsum.py +115 -0
  594. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +154 -0
  595. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mgsm.py +122 -0
  596. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mimic_repsum.py +140 -0
  597. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +172 -0
  598. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mlqa.py +143 -0
  599. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +144 -0
  600. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_cot.py +88 -0
  601. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_mc.py +107 -0
  602. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_pro.py +145 -0
  603. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +189 -0
  604. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmmu.py +150 -0
  605. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mnli.py +113 -0
  606. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/model_written_evals.py +115 -0
  607. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/moral_stories.py +151 -0
  608. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  609. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog.py +118 -0
  610. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog_perplexity.py +97 -0
  611. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +134 -0
  612. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multilingual.py +106 -0
  613. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  614. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  615. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +107 -0
  616. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +173 -0
  617. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +157 -0
  618. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen.py +277 -0
  619. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +165 -0
  620. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +228 -0
  621. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +223 -0
  622. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noticia.py +105 -0
  623. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +135 -0
  624. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi.py +27 -0
  625. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +167 -0
  626. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +174 -0
  627. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +162 -0
  628. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +209 -0
  629. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +186 -0
  630. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph_perplexity.py +97 -0
  631. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +118 -0
  632. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +107 -0
  633. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paloma.py +205 -0
  634. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +110 -0
  635. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +110 -0
  636. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +107 -0
  637. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +154 -0
  638. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +115 -0
  639. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +246 -0
  640. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +144 -0
  641. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases_ca_va.py +82 -0
  642. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +161 -0
  643. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile_10k.py +140 -0
  644. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +116 -0
  645. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polemo2.py +135 -0
  646. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +155 -0
  647. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +155 -0
  648. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_gen.py +121 -0
  649. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_mc.py +103 -0
  650. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +107 -0
  651. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +115 -0
  652. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  653. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +119 -0
  654. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +118 -0
  655. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +112 -0
  656. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  657. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +107 -0
  658. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  659. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/quac.py +111 -0
  660. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +124 -0
  661. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +107 -0
  662. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/realtoxicityprompts.py +124 -0
  663. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +125 -0
  664. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +110 -0
  665. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  666. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +170 -0
  667. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +113 -0
  668. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +177 -0
  669. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +161 -0
  670. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +157 -0
  671. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +110 -0
  672. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +131 -0
  673. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +119 -0
  674. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/simple_cooccurrence_bias.py +121 -0
  675. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +209 -0
  676. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  677. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +155 -0
  678. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_gen.py +117 -0
  679. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_mc.py +110 -0
  680. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +129 -0
  681. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad_completion.py +121 -0
  682. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  683. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +250 -0
  684. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +107 -0
  685. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +107 -0
  686. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +154 -0
  687. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/superglue.py +111 -0
  688. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/supergpqa.py +111 -0
  689. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +115 -0
  690. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +179 -0
  691. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +117 -0
  692. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +110 -0
  693. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +110 -0
  694. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +110 -0
  695. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +155 -0
  696. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +110 -0
  697. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +110 -0
  698. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +110 -0
  699. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +113 -0
  700. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +110 -0
  701. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +181 -0
  702. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/toxigen.py +91 -0
  703. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/translation.py +149 -0
  704. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +130 -0
  705. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +112 -0
  706. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +120 -0
  707. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +140 -0
  708. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_multi.py +142 -0
  709. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +152 -0
  710. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +161 -0
  711. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_cot.py +104 -0
  712. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +102 -0
  713. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/twenty_newsgroups.py +111 -0
  714. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unitxt.py +131 -0
  715. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +155 -0
  716. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +95 -0
  717. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +130 -0
  718. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +122 -0
  719. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wikitext.py +146 -0
  720. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogender.py +139 -0
  721. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +118 -0
  722. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +155 -0
  723. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt14.py +110 -0
  724. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt16.py +118 -0
  725. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +114 -0
  726. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +117 -0
  727. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +180 -0
  728. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +197 -0
  729. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +147 -0
  730. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +131 -0
  731. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +203 -0
  732. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +129 -0
  733. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +124 -0
  734. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/yahoo.py +108 -0
  735. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +155 -0
  736. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +56 -0
  737. wisent/core/data_loaders/__init__.py +235 -0
  738. wisent/core/data_loaders/core/__init__.py +0 -0
  739. wisent/core/data_loaders/core/atoms.py +99 -0
  740. wisent/core/data_loaders/loaders/__init__.py +0 -0
  741. wisent/core/data_loaders/loaders/custom.py +120 -0
  742. wisent/core/data_loaders/loaders/huggingface_loader.py +153 -0
  743. wisent/core/data_loaders/loaders/lm_loader.py +494 -0
  744. wisent/core/data_loaders/loaders/lm_loader_special_cases.py +496 -0
  745. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  746. wisent/core/data_loaders/rotator.py +118 -0
  747. wisent/core/detection_handling.py +259 -0
  748. wisent/core/diversity_processors.py +193 -0
  749. wisent/core/download_full_benchmarks.py +1512 -0
  750. wisent/core/errors/__init__.py +203 -0
  751. wisent/core/errors/error_codes.py +763 -0
  752. wisent/core/errors/error_handler.py +134 -0
  753. wisent/core/evaluators/__init__.py +0 -0
  754. wisent/core/evaluators/benchmark_specific/__init__.py +42 -0
  755. wisent/core/evaluators/benchmark_specific/aime_evaluator.py +90 -0
  756. wisent/core/evaluators/benchmark_specific/coding/__init__.py +0 -0
  757. wisent/core/evaluators/benchmark_specific/coding/metrics/__init__.py +0 -0
  758. wisent/core/evaluators/benchmark_specific/coding/metrics/core/__init__.py +0 -0
  759. wisent/core/evaluators/benchmark_specific/coding/metrics/core/atoms.py +36 -0
  760. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +363 -0
  761. wisent/core/evaluators/benchmark_specific/coding/metrics/passk.py +67 -0
  762. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/__init__.py +0 -0
  763. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/__init__.py +0 -0
  764. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/atoms.py +27 -0
  765. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  766. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/java_sanitizer.py +78 -0
  767. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/python_sanitizer.py +94 -0
  768. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/utils.py +126 -0
  769. wisent/core/evaluators/benchmark_specific/coding/providers/__init__.py +18 -0
  770. wisent/core/evaluators/benchmark_specific/coding/providers/core/__init__.py +0 -0
  771. wisent/core/evaluators/benchmark_specific/coding/providers/core/atoms.py +31 -0
  772. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  773. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  774. wisent/core/evaluators/benchmark_specific/coding/safe_docker/Dockerfile +31 -0
  775. wisent/core/evaluators/benchmark_specific/coding/safe_docker/__init__.py +0 -0
  776. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/__init__.py +0 -0
  777. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/atoms.py +105 -0
  778. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/runtime.py +143 -0
  779. wisent/core/evaluators/benchmark_specific/coding/safe_docker/entrypoint.py +121 -0
  780. wisent/core/evaluators/benchmark_specific/coding/safe_docker/recipes.py +60 -0
  781. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  782. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +332 -0
  783. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +81 -0
  784. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +173 -0
  785. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +488 -0
  786. wisent/core/evaluators/benchmark_specific/livemathbench_evaluator.py +393 -0
  787. wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py +202 -0
  788. wisent/core/evaluators/benchmark_specific/math_evaluator.py +119 -0
  789. wisent/core/evaluators/benchmark_specific/math_parsing/__init__.py +1 -0
  790. wisent/core/evaluators/benchmark_specific/math_parsing/core.py +1640 -0
  791. wisent/core/evaluators/benchmark_specific/math_parsing/extract_boxed.py +48 -0
  792. wisent/core/evaluators/benchmark_specific/math_parsing/is_equiv.py +159 -0
  793. wisent/core/evaluators/benchmark_specific/math_parsing/scripts.py +919 -0
  794. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +175 -0
  795. wisent/core/evaluators/benchmark_specific/polymath_evaluator.py +114 -0
  796. wisent/core/evaluators/core/__init__.py +5 -0
  797. wisent/core/evaluators/core/atoms.py +166 -0
  798. wisent/core/evaluators/custom/__init__.py +20 -0
  799. wisent/core/evaluators/custom/custom_evaluator.py +382 -0
  800. wisent/core/evaluators/custom/examples/__init__.py +37 -0
  801. wisent/core/evaluators/custom/examples/desklib_detector.py +166 -0
  802. wisent/core/evaluators/custom/examples/gptzero.py +185 -0
  803. wisent/core/evaluators/custom/examples/humanization.py +79 -0
  804. wisent/core/evaluators/custom/examples/humanization_coherent.py +127 -0
  805. wisent/core/evaluators/custom/examples/roberta_detector.py +173 -0
  806. wisent/core/evaluators/oracles/__init__.py +0 -0
  807. wisent/core/evaluators/oracles/interactive.py +73 -0
  808. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  809. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +168 -0
  810. wisent/core/evaluators/oracles/user_specified.py +67 -0
  811. wisent/core/evaluators/personalization/__init__.py +12 -0
  812. wisent/core/evaluators/personalization/alignment.py +166 -0
  813. wisent/core/evaluators/personalization/coherence.py +325 -0
  814. wisent/core/evaluators/personalization/difference.py +73 -0
  815. wisent/core/evaluators/rotator.py +217 -0
  816. wisent/core/evaluators/steering_evaluators.py +386 -0
  817. wisent/core/evaluators/synthetic_evaluator.py +377 -0
  818. wisent/core/hyperparameter_optimizer.py +547 -0
  819. wisent/core/layer.py +17 -0
  820. wisent/core/lm_eval_harness_ground_truth.py +1431 -0
  821. wisent/core/main.py +101 -0
  822. wisent/core/managed_cached_benchmarks.py +609 -0
  823. wisent/core/mixed_benchmark_sampler.py +366 -0
  824. wisent/core/modalities/__init__.py +545 -0
  825. wisent/core/model_persistence.py +302 -0
  826. wisent/core/models/__init__.py +23 -0
  827. wisent/core/models/core/__init__.py +0 -0
  828. wisent/core/models/core/atoms.py +465 -0
  829. wisent/core/models/inference_config.py +127 -0
  830. wisent/core/models/wisent_model.py +893 -0
  831. wisent/core/multi_steering.py +397 -0
  832. wisent/core/opti/__init__.py +0 -0
  833. wisent/core/opti/core/__init__.py +0 -0
  834. wisent/core/opti/core/atoms.py +177 -0
  835. wisent/core/opti/methods/__init__.py +10 -0
  836. wisent/core/opti/methods/opti_classificator.py +172 -0
  837. wisent/core/opti/methods/opti_steering.py +139 -0
  838. wisent/core/opti/methods/opti_weights.py +523 -0
  839. wisent/core/optuna/__init__.py +54 -0
  840. wisent/core/optuna/classifier/__init__.py +25 -0
  841. wisent/core/optuna/classifier/activation_generator.py +351 -0
  842. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  843. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +685 -0
  844. wisent/core/optuna/steering/__init__.py +20 -0
  845. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +200 -0
  846. wisent/core/optuna/steering/data_utils.py +342 -0
  847. wisent/core/optuna/steering/metrics.py +412 -0
  848. wisent/core/optuna/steering/steering_optimization.py +1096 -0
  849. wisent/core/parser.py +1662 -0
  850. wisent/core/parser_arguments/__init__.py +10 -0
  851. wisent/core/parser_arguments/agent_parser.py +122 -0
  852. wisent/core/parser_arguments/check_linearity_parser.py +82 -0
  853. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  854. wisent/core/parser_arguments/create_steering_vector_parser.py +67 -0
  855. wisent/core/parser_arguments/diagnose_pairs_parser.py +25 -0
  856. wisent/core/parser_arguments/diagnose_vectors_parser.py +72 -0
  857. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  858. wisent/core/parser_arguments/evaluate_refusal_parser.py +32 -0
  859. wisent/core/parser_arguments/evaluate_responses_parser.py +12 -0
  860. wisent/core/parser_arguments/full_optimize_parser.py +194 -0
  861. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  862. wisent/core/parser_arguments/generate_pairs_parser.py +43 -0
  863. wisent/core/parser_arguments/generate_responses_parser.py +16 -0
  864. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +148 -0
  865. wisent/core/parser_arguments/generate_vector_from_task_parser.py +149 -0
  866. wisent/core/parser_arguments/generate_vector_parser.py +89 -0
  867. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  868. wisent/core/parser_arguments/inference_config_parser.py +65 -0
  869. wisent/core/parser_arguments/main_parser.py +220 -0
  870. wisent/core/parser_arguments/model_config_parser.py +59 -0
  871. wisent/core/parser_arguments/modify_weights_parser.py +309 -0
  872. wisent/core/parser_arguments/monitor_parser.py +17 -0
  873. wisent/core/parser_arguments/multi_steer_parser.py +48 -0
  874. wisent/core/parser_arguments/nonsense_parser.py +26 -0
  875. wisent/core/parser_arguments/optimization_cache_parser.py +64 -0
  876. wisent/core/parser_arguments/optimize_classification_parser.py +108 -0
  877. wisent/core/parser_arguments/optimize_parser.py +142 -0
  878. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  879. wisent/core/parser_arguments/optimize_steering_parser.py +617 -0
  880. wisent/core/parser_arguments/optimize_weights_parser.py +403 -0
  881. wisent/core/parser_arguments/synthetic_parser.py +117 -0
  882. wisent/core/parser_arguments/tasks_parser.py +591 -0
  883. wisent/core/parser_arguments/train_unified_goodness_parser.py +172 -0
  884. wisent/core/parser_arguments/utils.py +107 -0
  885. wisent/core/prompts/__init__.py +0 -0
  886. wisent/core/prompts/core/__init__.py +0 -0
  887. wisent/core/prompts/core/atom.py +57 -0
  888. wisent/core/prompts/core/prompt_formater.py +148 -0
  889. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  890. wisent/core/prompts/prompt_stratiegies/direct_completion.py +26 -0
  891. wisent/core/prompts/prompt_stratiegies/instruction_following.py +26 -0
  892. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +31 -0
  893. wisent/core/prompts/prompt_stratiegies/role_playing.py +33 -0
  894. wisent/core/representation.py +5 -0
  895. wisent/core/save_results.py +277 -0
  896. wisent/core/steering.py +660 -0
  897. wisent/core/steering_method.py +20 -0
  898. wisent/core/steering_methods/__init__.py +54 -0
  899. wisent/core/steering_methods/core/__init__.py +0 -0
  900. wisent/core/steering_methods/core/atoms.py +154 -0
  901. wisent/core/steering_methods/methods/__init__.py +0 -0
  902. wisent/core/steering_methods/methods/caa.py +45 -0
  903. wisent/core/steering_methods/methods/prism.py +588 -0
  904. wisent/core/steering_methods/methods/pulse.py +641 -0
  905. wisent/core/steering_methods/methods/titan.py +1005 -0
  906. wisent/core/steering_methods/preflight.py +322 -0
  907. wisent/core/steering_methods/registry.py +649 -0
  908. wisent/core/steering_methods/rotator.py +121 -0
  909. wisent/core/steering_optimizer.py +1503 -0
  910. wisent/core/synthetic/__init__.py +0 -0
  911. wisent/core/synthetic/cleaners/__init__.py +0 -0
  912. wisent/core/synthetic/cleaners/core/__init__.py +0 -0
  913. wisent/core/synthetic/cleaners/core/atoms.py +58 -0
  914. wisent/core/synthetic/cleaners/deduper_cleaner.py +53 -0
  915. wisent/core/synthetic/cleaners/methods/__init__.py +0 -0
  916. wisent/core/synthetic/cleaners/methods/base_dedupers.py +321 -0
  917. wisent/core/synthetic/cleaners/methods/base_refusalers.py +286 -0
  918. wisent/core/synthetic/cleaners/methods/core/__init__.py +0 -0
  919. wisent/core/synthetic/cleaners/methods/core/atoms.py +47 -0
  920. wisent/core/synthetic/cleaners/pairs_cleaner.py +90 -0
  921. wisent/core/synthetic/cleaners/refusaler_cleaner.py +133 -0
  922. wisent/core/synthetic/db_instructions/__init__.py +0 -0
  923. wisent/core/synthetic/db_instructions/core/__init__.py +0 -0
  924. wisent/core/synthetic/db_instructions/core/atoms.py +25 -0
  925. wisent/core/synthetic/db_instructions/mini_dp.py +115 -0
  926. wisent/core/synthetic/generators/__init__.py +0 -0
  927. wisent/core/synthetic/generators/core/__init__.py +0 -0
  928. wisent/core/synthetic/generators/core/atoms.py +73 -0
  929. wisent/core/synthetic/generators/diversities/__init__.py +0 -0
  930. wisent/core/synthetic/generators/diversities/core/__init__.py +0 -0
  931. wisent/core/synthetic/generators/diversities/core/core.py +68 -0
  932. wisent/core/synthetic/generators/diversities/methods/__init__.py +0 -0
  933. wisent/core/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  934. wisent/core/synthetic/generators/nonsense_generator.py +150 -0
  935. wisent/core/synthetic/generators/pairs_generator.py +313 -0
  936. wisent/core/task_interface.py +143 -0
  937. wisent/core/task_selector.py +232 -0
  938. wisent/core/tasks/__init__.py +218 -0
  939. wisent/core/tasks/aime_task.py +142 -0
  940. wisent/core/tasks/file_task.py +212 -0
  941. wisent/core/tasks/hle_task.py +180 -0
  942. wisent/core/tasks/hmmt_task.py +120 -0
  943. wisent/core/tasks/livecodebench_task.py +94 -0
  944. wisent/core/tasks/livemathbench_task.py +159 -0
  945. wisent/core/tasks/lm_eval_task.py +611 -0
  946. wisent/core/tasks/math500_task.py +84 -0
  947. wisent/core/tasks/polymath_task.py +147 -0
  948. wisent/core/tasks/supergpqa_task.py +220 -0
  949. wisent/core/time_estimator.py +155 -0
  950. wisent/core/timing_calibration.py +176 -0
  951. wisent/core/tracking/__init__.py +54 -0
  952. wisent/core/tracking/latency.py +620 -0
  953. wisent/core/tracking/memory.py +360 -0
  954. wisent/core/trainers/__init__.py +0 -0
  955. wisent/core/trainers/core/__init__.py +11 -0
  956. wisent/core/trainers/core/atoms.py +45 -0
  957. wisent/core/trainers/steering_trainer.py +365 -0
  958. wisent/core/universal_subspace.py +918 -0
  959. wisent/core/user_model_config.py +158 -0
  960. wisent/core/utils/__init__.py +64 -0
  961. wisent/core/utils/base_rotator.py +292 -0
  962. wisent/core/utils/dataset_splits.py +197 -0
  963. wisent/core/utils/device.py +279 -0
  964. wisent/core/weight_modification/__init__.py +134 -0
  965. wisent/core/weight_modification/additive.py +340 -0
  966. wisent/core/weight_modification/directional.py +1357 -0
  967. wisent/core/weight_modification/export.py +359 -0
  968. wisent/core/weight_modification/multi_direction.py +410 -0
  969. wisent/core/weight_modification/utils.py +236 -0
  970. wisent/core/wisent.py +660 -0
  971. wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +2112 -0
  972. wisent/examples/scripts/1/test_basqueglue_evaluation.json +51 -0
  973. wisent/examples/scripts/1/test_basqueglue_pairs.json +14 -0
  974. wisent/examples/scripts/1/test_bec2016eu_evaluation.json +51 -0
  975. wisent/examples/scripts/1/test_bec2016eu_pairs.json +14 -0
  976. wisent/examples/scripts/1/test_belebele_evaluation.json +51 -0
  977. wisent/examples/scripts/1/test_belebele_pairs.json +14 -0
  978. wisent/examples/scripts/1/test_benchmarks_evaluation.json +51 -0
  979. wisent/examples/scripts/1/test_benchmarks_pairs.json +14 -0
  980. wisent/examples/scripts/1/test_bertaqa_evaluation.json +51 -0
  981. wisent/examples/scripts/1/test_bertaqa_pairs.json +14 -0
  982. wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +30 -0
  983. wisent/examples/scripts/1/test_bhtc_v2_pairs.json +8 -0
  984. wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +30 -0
  985. wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +8 -0
  986. wisent/examples/scripts/1/test_cabreu_evaluation.json +30 -0
  987. wisent/examples/scripts/1/test_cabreu_pairs.json +8 -0
  988. wisent/examples/scripts/1/test_careqa_en_evaluation.json +30 -0
  989. wisent/examples/scripts/1/test_careqa_en_pairs.json +8 -0
  990. wisent/examples/scripts/1/test_careqa_evaluation.json +30 -0
  991. wisent/examples/scripts/1/test_careqa_pairs.json +8 -0
  992. wisent/examples/scripts/1/test_catalanqa_evaluation.json +30 -0
  993. wisent/examples/scripts/1/test_catalanqa_pairs.json +8 -0
  994. wisent/examples/scripts/1/test_catcola_evaluation.json +30 -0
  995. wisent/examples/scripts/1/test_catcola_pairs.json +8 -0
  996. wisent/examples/scripts/1/test_chartqa_evaluation.json +30 -0
  997. wisent/examples/scripts/1/test_chartqa_pairs.json +8 -0
  998. wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +30 -0
  999. wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +8 -0
  1000. wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +30 -0
  1001. wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +8 -0
  1002. wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +30 -0
  1003. wisent/examples/scripts/1/test_cocoteros_es_pairs.json +8 -0
  1004. wisent/examples/scripts/1/test_coedit_gec_evaluation.json +30 -0
  1005. wisent/examples/scripts/1/test_coedit_gec_pairs.json +8 -0
  1006. wisent/examples/scripts/1/test_cola_evaluation.json +30 -0
  1007. wisent/examples/scripts/1/test_cola_pairs.json +8 -0
  1008. wisent/examples/scripts/1/test_coqcat_evaluation.json +30 -0
  1009. wisent/examples/scripts/1/test_coqcat_pairs.json +8 -0
  1010. wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +30 -0
  1011. wisent/examples/scripts/1/test_dbpedia_14_pairs.json +8 -0
  1012. wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +30 -0
  1013. wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +8 -0
  1014. wisent/examples/scripts/1/test_ethos_binary_evaluation.json +30 -0
  1015. wisent/examples/scripts/1/test_ethos_binary_pairs.json +8 -0
  1016. wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +30 -0
  1017. wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +8 -0
  1018. wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +30 -0
  1019. wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +8 -0
  1020. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +30 -0
  1021. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +8 -0
  1022. wisent/examples/scripts/2/test_arc_ar_evaluation.json +30 -0
  1023. wisent/examples/scripts/2/test_arc_ar_pairs.json +8 -0
  1024. wisent/examples/scripts/2/test_atis_evaluation.json +30 -0
  1025. wisent/examples/scripts/2/test_atis_pairs.json +8 -0
  1026. wisent/examples/scripts/2/test_babi_evaluation.json +30 -0
  1027. wisent/examples/scripts/2/test_babi_pairs.json +8 -0
  1028. wisent/examples/scripts/2/test_babilong_evaluation.json +30 -0
  1029. wisent/examples/scripts/2/test_babilong_pairs.json +8 -0
  1030. wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +30 -0
  1031. wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +8 -0
  1032. wisent/examples/scripts/2/test_basque-glue_pairs.json +14 -0
  1033. wisent/examples/scripts/benchmark_tags.json +2140 -0
  1034. wisent/examples/scripts/lm_eval_readme.json +4 -0
  1035. wisent/examples/scripts/results/benchmark_descriptions.json +1244 -0
  1036. wisent/examples/scripts/results/benchmark_evaluation_methods.json +66 -0
  1037. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +2781 -0
  1038. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +30536 -0
  1039. wisent/examples/scripts/results/benchmark_evaluators_clean.json +469 -0
  1040. wisent/examples/scripts/results/benchmark_methods_summary.json +260 -0
  1041. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +66 -0
  1042. wisent/examples/scripts/results/benchmark_pair_totals.json +269 -0
  1043. wisent/examples/scripts/results/benchmark_tags.json +917 -0
  1044. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +71 -0
  1045. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +150 -0
  1046. wisent/examples/scripts/results/failing_benchmarks.json +946 -0
  1047. wisent/examples/scripts/results/failing_benchmarks_list.json +41 -0
  1048. wisent/examples/scripts/results/failing_benchmarks_test_results.json +945 -0
  1049. wisent/examples/scripts/results/missing_benchmark_tags.json +341 -0
  1050. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +30 -0
  1051. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +8 -0
  1052. wisent/examples/scripts/results/test_AraDICE_evaluation.json +51 -0
  1053. wisent/examples/scripts/results/test_AraDICE_pairs.json +14 -0
  1054. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +30 -0
  1055. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +8 -0
  1056. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +51 -0
  1057. wisent/examples/scripts/results/test_ArabCulture_pairs.json +14 -0
  1058. wisent/examples/scripts/results/test_Tag_evaluation.json +30 -0
  1059. wisent/examples/scripts/results/test_Tag_pairs.json +8 -0
  1060. wisent/examples/scripts/results/test_aclue_evaluation.json +51 -0
  1061. wisent/examples/scripts/results/test_aclue_pairs.json +14 -0
  1062. wisent/examples/scripts/results/test_acp_bench_evaluation.json +51 -0
  1063. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +51 -0
  1064. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +14 -0
  1065. wisent/examples/scripts/results/test_acp_bench_pairs.json +14 -0
  1066. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +51 -0
  1067. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +14 -0
  1068. wisent/examples/scripts/results/test_aexams_evaluation.json +51 -0
  1069. wisent/examples/scripts/results/test_aexams_pairs.json +14 -0
  1070. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +30 -0
  1071. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +8 -0
  1072. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +30 -0
  1073. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +8 -0
  1074. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +30 -0
  1075. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +8 -0
  1076. wisent/examples/scripts/results/test_ag_news_evaluation.json +30 -0
  1077. wisent/examples/scripts/results/test_ag_news_pairs.json +8 -0
  1078. wisent/examples/scripts/results/test_agieval_evaluation.json +51 -0
  1079. wisent/examples/scripts/results/test_agieval_pairs.json +14 -0
  1080. wisent/examples/scripts/results/test_aime2024_evaluation.json +30 -0
  1081. wisent/examples/scripts/results/test_aime2024_pairs.json +8 -0
  1082. wisent/examples/scripts/results/test_aime2025_evaluation.json +30 -0
  1083. wisent/examples/scripts/results/test_aime2025_pairs.json +8 -0
  1084. wisent/examples/scripts/results/test_aime_evaluation.json +30 -0
  1085. wisent/examples/scripts/results/test_aime_pairs.json +8 -0
  1086. wisent/examples/scripts/results/test_anagrams1_evaluation.json +30 -0
  1087. wisent/examples/scripts/results/test_anagrams1_pairs.json +8 -0
  1088. wisent/examples/scripts/results/test_anagrams2_evaluation.json +30 -0
  1089. wisent/examples/scripts/results/test_anagrams2_pairs.json +8 -0
  1090. wisent/examples/scripts/results/test_anli_evaluation.json +30 -0
  1091. wisent/examples/scripts/results/test_anli_pairs.json +8 -0
  1092. wisent/examples/scripts/results/test_apps_evaluation.json +30 -0
  1093. wisent/examples/scripts/results/test_apps_pairs.json +8 -0
  1094. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +30 -0
  1095. wisent/examples/scripts/results/test_arabic_exams_pairs.json +8 -0
  1096. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +51 -0
  1097. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +14 -0
  1098. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +51 -0
  1099. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +14 -0
  1100. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +51 -0
  1101. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +14 -0
  1102. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +51 -0
  1103. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +14 -0
  1104. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +51 -0
  1105. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +14 -0
  1106. wisent/examples/scripts/results/test_arc_ar_evaluation.json +30 -0
  1107. wisent/examples/scripts/results/test_arc_ar_pairs.json +8 -0
  1108. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +30 -0
  1109. wisent/examples/scripts/results/test_arc_challenge_pairs.json +8 -0
  1110. wisent/examples/scripts/results/test_arc_easy_evaluation.json +30 -0
  1111. wisent/examples/scripts/results/test_arc_easy_pairs.json +8 -0
  1112. wisent/examples/scripts/results/test_argument_topic_evaluation.json +30 -0
  1113. wisent/examples/scripts/results/test_argument_topic_pairs.json +8 -0
  1114. wisent/examples/scripts/results/test_arithmetic_evaluation.json +51 -0
  1115. wisent/examples/scripts/results/test_arithmetic_pairs.json +14 -0
  1116. wisent/examples/scripts/results/test_asdiv_evaluation.json +30 -0
  1117. wisent/examples/scripts/results/test_asdiv_pairs.json +8 -0
  1118. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +30 -0
  1119. wisent/examples/scripts/results/test_assin_entailment_pairs.json +8 -0
  1120. wisent/examples/scripts/results/test_atis_evaluation.json +30 -0
  1121. wisent/examples/scripts/results/test_atis_pairs.json +8 -0
  1122. wisent/examples/scripts/results/test_babi_evaluation.json +30 -0
  1123. wisent/examples/scripts/results/test_babi_pairs.json +8 -0
  1124. wisent/examples/scripts/results/test_babilong_evaluation.json +30 -0
  1125. wisent/examples/scripts/results/test_babilong_pairs.json +8 -0
  1126. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +30 -0
  1127. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +8 -0
  1128. wisent/examples/scripts/results/test_banking77_evaluation.json +30 -0
  1129. wisent/examples/scripts/results/test_banking77_pairs.json +8 -0
  1130. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +14 -0
  1131. wisent/examples/scripts/results/test_basque-glue_evaluation.json +51 -0
  1132. wisent/examples/scripts/results/test_basque-glue_pairs.json +14 -0
  1133. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +51 -0
  1134. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +14 -0
  1135. wisent/examples/scripts/results/test_basque_bench_evaluation.json +51 -0
  1136. wisent/examples/scripts/results/test_basque_bench_pairs.json +14 -0
  1137. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +51 -0
  1138. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +14 -0
  1139. wisent/examples/scripts/results/test_basqueglue_evaluation.json +51 -0
  1140. wisent/examples/scripts/results/test_basqueglue_pairs.json +14 -0
  1141. wisent/examples/scripts/results/test_bbh_evaluation.json +51 -0
  1142. wisent/examples/scripts/results/test_bbh_pairs.json +14 -0
  1143. wisent/examples/scripts/results/test_bbq_evaluation.json +30 -0
  1144. wisent/examples/scripts/results/test_bbq_pairs.json +8 -0
  1145. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +51 -0
  1146. wisent/examples/scripts/results/test_bec2016eu_pairs.json +14 -0
  1147. wisent/examples/scripts/results/test_belebele_evaluation.json +51 -0
  1148. wisent/examples/scripts/results/test_belebele_pairs.json +14 -0
  1149. wisent/examples/scripts/results/test_benchmarks_evaluation.json +51 -0
  1150. wisent/examples/scripts/results/test_benchmarks_pairs.json +14 -0
  1151. wisent/examples/scripts/results/test_bertaqa_evaluation.json +51 -0
  1152. wisent/examples/scripts/results/test_bertaqa_pairs.json +14 -0
  1153. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +30 -0
  1154. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +8 -0
  1155. wisent/examples/scripts/results/test_bigbench_evaluation.json +51 -0
  1156. wisent/examples/scripts/results/test_bigbench_pairs.json +14 -0
  1157. wisent/examples/scripts/results/test_blimp_evaluation.json +51 -0
  1158. wisent/examples/scripts/results/test_blimp_pairs.json +14 -0
  1159. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +30 -0
  1160. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +8 -0
  1161. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +30 -0
  1162. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +8 -0
  1163. wisent/examples/scripts/results/test_boolq_evaluation.json +30 -0
  1164. wisent/examples/scripts/results/test_boolq_pairs.json +8 -0
  1165. wisent/examples/scripts/results/test_c4_evaluation.json +30 -0
  1166. wisent/examples/scripts/results/test_c4_pairs.json +8 -0
  1167. wisent/examples/scripts/results/test_cabreu_evaluation.json +30 -0
  1168. wisent/examples/scripts/results/test_cabreu_pairs.json +8 -0
  1169. wisent/examples/scripts/results/test_careqa_evaluation.json +30 -0
  1170. wisent/examples/scripts/results/test_careqa_pairs.json +8 -0
  1171. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +51 -0
  1172. wisent/examples/scripts/results/test_catalan_bench_pairs.json +14 -0
  1173. wisent/examples/scripts/results/test_catalanqa_evaluation.json +30 -0
  1174. wisent/examples/scripts/results/test_catalanqa_pairs.json +8 -0
  1175. wisent/examples/scripts/results/test_catcola_evaluation.json +30 -0
  1176. wisent/examples/scripts/results/test_catcola_pairs.json +8 -0
  1177. wisent/examples/scripts/results/test_cb_evaluation.json +30 -0
  1178. wisent/examples/scripts/results/test_cb_pairs.json +8 -0
  1179. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +51 -0
  1180. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +14 -0
  1181. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +30 -0
  1182. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +8 -0
  1183. wisent/examples/scripts/results/test_ceval_evaluation.json +51 -0
  1184. wisent/examples/scripts/results/test_ceval_pairs.json +14 -0
  1185. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +51 -0
  1186. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +14 -0
  1187. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +51 -0
  1188. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +14 -0
  1189. wisent/examples/scripts/results/test_chartqa_evaluation.json +30 -0
  1190. wisent/examples/scripts/results/test_chartqa_pairs.json +8 -0
  1191. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +30 -0
  1192. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +8 -0
  1193. wisent/examples/scripts/results/test_cmmlu_evaluation.json +51 -0
  1194. wisent/examples/scripts/results/test_cmmlu_pairs.json +14 -0
  1195. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +30 -0
  1196. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +8 -0
  1197. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +30 -0
  1198. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +8 -0
  1199. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +30 -0
  1200. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +8 -0
  1201. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +30 -0
  1202. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +8 -0
  1203. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +30 -0
  1204. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +8 -0
  1205. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +30 -0
  1206. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +8 -0
  1207. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +30 -0
  1208. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +8 -0
  1209. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +30 -0
  1210. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +8 -0
  1211. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +30 -0
  1212. wisent/examples/scripts/results/test_coedit_gec_pairs.json +8 -0
  1213. wisent/examples/scripts/results/test_cola_evaluation.json +30 -0
  1214. wisent/examples/scripts/results/test_cola_pairs.json +8 -0
  1215. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +30 -0
  1216. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +8 -0
  1217. wisent/examples/scripts/results/test_conala_evaluation.json +30 -0
  1218. wisent/examples/scripts/results/test_conala_pairs.json +8 -0
  1219. wisent/examples/scripts/results/test_concode_evaluation.json +30 -0
  1220. wisent/examples/scripts/results/test_concode_pairs.json +8 -0
  1221. wisent/examples/scripts/results/test_copa_evaluation.json +30 -0
  1222. wisent/examples/scripts/results/test_copa_pairs.json +8 -0
  1223. wisent/examples/scripts/results/test_copal_id_evaluation.json +30 -0
  1224. wisent/examples/scripts/results/test_copal_id_pairs.json +8 -0
  1225. wisent/examples/scripts/results/test_coqa_evaluation.json +30 -0
  1226. wisent/examples/scripts/results/test_coqa_pairs.json +8 -0
  1227. wisent/examples/scripts/results/test_coqcat_evaluation.json +30 -0
  1228. wisent/examples/scripts/results/test_coqcat_pairs.json +8 -0
  1229. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +51 -0
  1230. wisent/examples/scripts/results/test_crows_pairs_pairs.json +14 -0
  1231. wisent/examples/scripts/results/test_csatqa_evaluation.json +51 -0
  1232. wisent/examples/scripts/results/test_csatqa_pairs.json +14 -0
  1233. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +30 -0
  1234. wisent/examples/scripts/results/test_cycle_letters_pairs.json +8 -0
  1235. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +51 -0
  1236. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +14 -0
  1237. wisent/examples/scripts/results/test_darija_bench_evaluation.json +51 -0
  1238. wisent/examples/scripts/results/test_darija_bench_pairs.json +14 -0
  1239. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +30 -0
  1240. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +8 -0
  1241. wisent/examples/scripts/results/test_darijammlu_evaluation.json +51 -0
  1242. wisent/examples/scripts/results/test_darijammlu_pairs.json +14 -0
  1243. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +30 -0
  1244. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +8 -0
  1245. wisent/examples/scripts/results/test_drop_evaluation.json +30 -0
  1246. wisent/examples/scripts/results/test_drop_pairs.json +8 -0
  1247. wisent/examples/scripts/results/test_ds1000_evaluation.json +30 -0
  1248. wisent/examples/scripts/results/test_ds1000_pairs.json +8 -0
  1249. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +30 -0
  1250. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +8 -0
  1251. wisent/examples/scripts/results/test_egymmlu_evaluation.json +51 -0
  1252. wisent/examples/scripts/results/test_egymmlu_pairs.json +14 -0
  1253. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +30 -0
  1254. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +8 -0
  1255. wisent/examples/scripts/results/test_eq_bench_evaluation.json +30 -0
  1256. wisent/examples/scripts/results/test_eq_bench_pairs.json +8 -0
  1257. wisent/examples/scripts/results/test_escola_evaluation.json +30 -0
  1258. wisent/examples/scripts/results/test_escola_pairs.json +8 -0
  1259. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +30 -0
  1260. wisent/examples/scripts/results/test_ethics_cm_pairs.json +8 -0
  1261. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +30 -0
  1262. wisent/examples/scripts/results/test_ethos_binary_pairs.json +8 -0
  1263. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +51 -0
  1264. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +14 -0
  1265. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +51 -0
  1266. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +14 -0
  1267. wisent/examples/scripts/results/test_eus_exams_evaluation.json +51 -0
  1268. wisent/examples/scripts/results/test_eus_exams_pairs.json +14 -0
  1269. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +30 -0
  1270. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +8 -0
  1271. wisent/examples/scripts/results/test_eus_reading_evaluation.json +30 -0
  1272. wisent/examples/scripts/results/test_eus_reading_pairs.json +8 -0
  1273. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +30 -0
  1274. wisent/examples/scripts/results/test_eus_trivia_pairs.json +8 -0
  1275. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +51 -0
  1276. wisent/examples/scripts/results/test_evalita-mp_pairs.json +14 -0
  1277. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
  1278. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
  1279. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +51 -0
  1280. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +14 -0
  1281. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +51 -0
  1282. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +14 -0
  1283. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +30 -0
  1284. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +8 -0
  1285. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +51 -0
  1286. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +14 -0
  1287. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
  1288. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
  1289. wisent/examples/scripts/results/test_fda_evaluation.json +30 -0
  1290. wisent/examples/scripts/results/test_fda_pairs.json +8 -0
  1291. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +30 -0
  1292. wisent/examples/scripts/results/test_financial_tweets_pairs.json +8 -0
  1293. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +30 -0
  1294. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +8 -0
  1295. wisent/examples/scripts/results/test_fld_evaluation.json +30 -0
  1296. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +30 -0
  1297. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +8 -0
  1298. wisent/examples/scripts/results/test_fld_pairs.json +8 -0
  1299. wisent/examples/scripts/results/test_flores_evaluation.json +51 -0
  1300. wisent/examples/scripts/results/test_flores_pairs.json +14 -0
  1301. wisent/examples/scripts/results/test_freebase_evaluation.json +30 -0
  1302. wisent/examples/scripts/results/test_freebase_pairs.json +8 -0
  1303. wisent/examples/scripts/results/test_french_bench_evaluation.json +51 -0
  1304. wisent/examples/scripts/results/test_french_bench_pairs.json +14 -0
  1305. wisent/examples/scripts/results/test_galcola_evaluation.json +30 -0
  1306. wisent/examples/scripts/results/test_galcola_pairs.json +8 -0
  1307. wisent/examples/scripts/results/test_galician_bench_evaluation.json +51 -0
  1308. wisent/examples/scripts/results/test_galician_bench_pairs.json +14 -0
  1309. wisent/examples/scripts/results/test_glianorex_evaluation.json +30 -0
  1310. wisent/examples/scripts/results/test_glianorex_pairs.json +8 -0
  1311. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +51 -0
  1312. wisent/examples/scripts/results/test_global_mmlu_pairs.json +14 -0
  1313. wisent/examples/scripts/results/test_glue_evaluation.json +51 -0
  1314. wisent/examples/scripts/results/test_glue_pairs.json +14 -0
  1315. wisent/examples/scripts/results/test_gpqa_evaluation.json +51 -0
  1316. wisent/examples/scripts/results/test_gpqa_pairs.json +14 -0
  1317. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +51 -0
  1318. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +14 -0
  1319. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +30 -0
  1320. wisent/examples/scripts/results/test_groundcocoa_pairs.json +8 -0
  1321. wisent/examples/scripts/results/test_gsm8k_evaluation.json +30 -0
  1322. wisent/examples/scripts/results/test_gsm8k_pairs.json +8 -0
  1323. wisent/examples/scripts/results/test_haerae_evaluation.json +51 -0
  1324. wisent/examples/scripts/results/test_haerae_pairs.json +14 -0
  1325. wisent/examples/scripts/results/test_headqa_evaluation.json +30 -0
  1326. wisent/examples/scripts/results/test_headqa_pairs.json +8 -0
  1327. wisent/examples/scripts/results/test_hellaswag_evaluation.json +30 -0
  1328. wisent/examples/scripts/results/test_hellaswag_pairs.json +8 -0
  1329. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +51 -0
  1330. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +14 -0
  1331. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +51 -0
  1332. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +14 -0
  1333. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +30 -0
  1334. wisent/examples/scripts/results/test_histoires_morales_pairs.json +8 -0
  1335. wisent/examples/scripts/results/test_hmmt_evaluation.json +30 -0
  1336. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +30 -0
  1337. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +8 -0
  1338. wisent/examples/scripts/results/test_hmmt_pairs.json +8 -0
  1339. wisent/examples/scripts/results/test_hrm8k_evaluation.json +51 -0
  1340. wisent/examples/scripts/results/test_hrm8k_pairs.json +14 -0
  1341. wisent/examples/scripts/results/test_humaneval_evaluation.json +30 -0
  1342. wisent/examples/scripts/results/test_humaneval_pairs.json +8 -0
  1343. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +30 -0
  1344. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +8 -0
  1345. wisent/examples/scripts/results/test_ifeval_evaluation.json +30 -0
  1346. wisent/examples/scripts/results/test_ifeval_pairs.json +8 -0
  1347. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +30 -0
  1348. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +8 -0
  1349. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +30 -0
  1350. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +8 -0
  1351. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +51 -0
  1352. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +30 -0
  1353. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +8 -0
  1354. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +51 -0
  1355. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +14 -0
  1356. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +14 -0
  1357. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +30 -0
  1358. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +8 -0
  1359. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +30 -0
  1360. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +8 -0
  1361. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +30 -0
  1362. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +8 -0
  1363. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +30 -0
  1364. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +8 -0
  1365. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +30 -0
  1366. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +8 -0
  1367. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +51 -0
  1368. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +14 -0
  1369. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +30 -0
  1370. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +8 -0
  1371. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +30 -0
  1372. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +30 -0
  1373. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +8 -0
  1374. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +8 -0
  1375. wisent/examples/scripts/results/test_kbl_evaluation.json +51 -0
  1376. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +51 -0
  1377. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +14 -0
  1378. wisent/examples/scripts/results/test_kbl_pairs.json +14 -0
  1379. wisent/examples/scripts/results/test_kmmlu_evaluation.json +51 -0
  1380. wisent/examples/scripts/results/test_kmmlu_pairs.json +14 -0
  1381. wisent/examples/scripts/results/test_kobest_evaluation.json +51 -0
  1382. wisent/examples/scripts/results/test_kobest_pairs.json +14 -0
  1383. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +30 -0
  1384. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +8 -0
  1385. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +30 -0
  1386. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +8 -0
  1387. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +30 -0
  1388. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +8 -0
  1389. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +30 -0
  1390. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +8 -0
  1391. wisent/examples/scripts/results/test_lambada_evaluation.json +30 -0
  1392. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1393. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1394. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +51 -0
  1395. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +14 -0
  1396. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +51 -0
  1397. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +14 -0
  1398. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +51 -0
  1399. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +14 -0
  1400. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +30 -0
  1401. wisent/examples/scripts/results/test_lambada_openai_pairs.json +8 -0
  1402. wisent/examples/scripts/results/test_lambada_pairs.json +8 -0
  1403. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1404. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1405. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1406. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1407. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +30 -0
  1408. wisent/examples/scripts/results/test_lambada_standard_pairs.json +8 -0
  1409. wisent/examples/scripts/results/test_leaderboard_evaluation.json +51 -0
  1410. wisent/examples/scripts/results/test_leaderboard_pairs.json +14 -0
  1411. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +51 -0
  1412. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +14 -0
  1413. wisent/examples/scripts/results/test_libra_evaluation.json +51 -0
  1414. wisent/examples/scripts/results/test_libra_pairs.json +14 -0
  1415. wisent/examples/scripts/results/test_lingoly_evaluation.json +30 -0
  1416. wisent/examples/scripts/results/test_lingoly_pairs.json +8 -0
  1417. wisent/examples/scripts/results/test_livecodebench_evaluation.json +30 -0
  1418. wisent/examples/scripts/results/test_livecodebench_pairs.json +8 -0
  1419. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +30 -0
  1420. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +8 -0
  1421. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +30 -0
  1422. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +8 -0
  1423. wisent/examples/scripts/results/test_llama_evaluation.json +30 -0
  1424. wisent/examples/scripts/results/test_llama_pairs.json +8 -0
  1425. wisent/examples/scripts/results/test_logiqa2_evaluation.json +30 -0
  1426. wisent/examples/scripts/results/test_logiqa2_pairs.json +8 -0
  1427. wisent/examples/scripts/results/test_logiqa_evaluation.json +30 -0
  1428. wisent/examples/scripts/results/test_logiqa_pairs.json +8 -0
  1429. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +51 -0
  1430. wisent/examples/scripts/results/test_m_mmlu_pairs.json +14 -0
  1431. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +51 -0
  1432. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +14 -0
  1433. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +30 -0
  1434. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +8 -0
  1435. wisent/examples/scripts/results/test_mastermind_evaluation.json +51 -0
  1436. wisent/examples/scripts/results/test_mastermind_pairs.json +14 -0
  1437. wisent/examples/scripts/results/test_math500_evaluation.json +30 -0
  1438. wisent/examples/scripts/results/test_math500_pairs.json +8 -0
  1439. wisent/examples/scripts/results/test_math_evaluation.json +30 -0
  1440. wisent/examples/scripts/results/test_math_pairs.json +8 -0
  1441. wisent/examples/scripts/results/test_mathqa_evaluation.json +30 -0
  1442. wisent/examples/scripts/results/test_mathqa_pairs.json +8 -0
  1443. wisent/examples/scripts/results/test_mbpp_evaluation.json +30 -0
  1444. wisent/examples/scripts/results/test_mbpp_pairs.json +8 -0
  1445. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +30 -0
  1446. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +8 -0
  1447. wisent/examples/scripts/results/test_mc_taco_evaluation.json +30 -0
  1448. wisent/examples/scripts/results/test_mc_taco_pairs.json +8 -0
  1449. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +51 -0
  1450. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +14 -0
  1451. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +30 -0
  1452. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +8 -0
  1453. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +51 -0
  1454. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +14 -0
  1455. wisent/examples/scripts/results/test_meddialog_evaluation.json +30 -0
  1456. wisent/examples/scripts/results/test_meddialog_pairs.json +8 -0
  1457. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +30 -0
  1458. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +8 -0
  1459. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +30 -0
  1460. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +8 -0
  1461. wisent/examples/scripts/results/test_medmcqa_evaluation.json +30 -0
  1462. wisent/examples/scripts/results/test_medmcqa_pairs.json +8 -0
  1463. wisent/examples/scripts/results/test_medqa_evaluation.json +30 -0
  1464. wisent/examples/scripts/results/test_medqa_pairs.json +8 -0
  1465. wisent/examples/scripts/results/test_medtext_evaluation.json +30 -0
  1466. wisent/examples/scripts/results/test_medtext_pairs.json +8 -0
  1467. wisent/examples/scripts/results/test_mela_evaluation.json +51 -0
  1468. wisent/examples/scripts/results/test_mela_pairs.json +14 -0
  1469. wisent/examples/scripts/results/test_meqsum_evaluation.json +30 -0
  1470. wisent/examples/scripts/results/test_meqsum_pairs.json +8 -0
  1471. wisent/examples/scripts/results/test_mercury_evaluation.json +30 -0
  1472. wisent/examples/scripts/results/test_mercury_pairs.json +8 -0
  1473. wisent/examples/scripts/results/test_metabench_evaluation.json +51 -0
  1474. wisent/examples/scripts/results/test_metabench_pairs.json +14 -0
  1475. wisent/examples/scripts/results/test_mgsm_evaluation.json +51 -0
  1476. wisent/examples/scripts/results/test_mgsm_pairs.json +14 -0
  1477. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +30 -0
  1478. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +8 -0
  1479. wisent/examples/scripts/results/test_minerva_math_evaluation.json +51 -0
  1480. wisent/examples/scripts/results/test_minerva_math_pairs.json +14 -0
  1481. wisent/examples/scripts/results/test_mlqa_evaluation.json +51 -0
  1482. wisent/examples/scripts/results/test_mlqa_pairs.json +14 -0
  1483. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +51 -0
  1484. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +14 -0
  1485. wisent/examples/scripts/results/test_mmlu_evaluation.json +51 -0
  1486. wisent/examples/scripts/results/test_mmlu_pairs.json +14 -0
  1487. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +51 -0
  1488. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +14 -0
  1489. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +51 -0
  1490. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +14 -0
  1491. wisent/examples/scripts/results/test_mmlusr_evaluation.json +30 -0
  1492. wisent/examples/scripts/results/test_mmlusr_pairs.json +8 -0
  1493. wisent/examples/scripts/results/test_mmmu_evaluation.json +51 -0
  1494. wisent/examples/scripts/results/test_mmmu_pairs.json +14 -0
  1495. wisent/examples/scripts/results/test_mnli_evaluation.json +30 -0
  1496. wisent/examples/scripts/results/test_mnli_pairs.json +8 -0
  1497. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +51 -0
  1498. wisent/examples/scripts/results/test_model_written_evals_pairs.json +14 -0
  1499. wisent/examples/scripts/results/test_moral_stories_evaluation.json +30 -0
  1500. wisent/examples/scripts/results/test_moral_stories_pairs.json +8 -0
  1501. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +30 -0
  1502. wisent/examples/scripts/results/test_mts_dialog_pairs.json +8 -0
  1503. wisent/examples/scripts/results/test_multiblimp_evaluation.json +51 -0
  1504. wisent/examples/scripts/results/test_multiblimp_pairs.json +14 -0
  1505. wisent/examples/scripts/results/test_multimedqa_evaluation.json +51 -0
  1506. wisent/examples/scripts/results/test_multimedqa_pairs.json +14 -0
  1507. wisent/examples/scripts/results/test_multipl_e_evaluation.json +30 -0
  1508. wisent/examples/scripts/results/test_multipl_e_pairs.json +8 -0
  1509. wisent/examples/scripts/results/test_mutual_evaluation.json +30 -0
  1510. wisent/examples/scripts/results/test_mutual_pairs.json +8 -0
  1511. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1512. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +8 -0
  1513. wisent/examples/scripts/results/test_noreval_evaluation.json +51 -0
  1514. wisent/examples/scripts/results/test_noreval_pairs.json +14 -0
  1515. wisent/examples/scripts/results/test_noticia_evaluation.json +30 -0
  1516. wisent/examples/scripts/results/test_noticia_pairs.json +8 -0
  1517. wisent/examples/scripts/results/test_nq_open_evaluation.json +30 -0
  1518. wisent/examples/scripts/results/test_nq_open_pairs.json +8 -0
  1519. wisent/examples/scripts/results/test_olaph_evaluation.json +30 -0
  1520. wisent/examples/scripts/results/test_olaph_pairs.json +8 -0
  1521. wisent/examples/scripts/results/test_openbookqa_evaluation.json +30 -0
  1522. wisent/examples/scripts/results/test_openbookqa_pairs.json +8 -0
  1523. wisent/examples/scripts/results/test_openllm_evaluation.json +51 -0
  1524. wisent/examples/scripts/results/test_openllm_pairs.json +14 -0
  1525. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1526. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +8 -0
  1527. wisent/examples/scripts/results/test_paloma_evaluation.json +51 -0
  1528. wisent/examples/scripts/results/test_paloma_pairs.json +14 -0
  1529. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +30 -0
  1530. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +8 -0
  1531. wisent/examples/scripts/results/test_paws-x_evaluation.json +51 -0
  1532. wisent/examples/scripts/results/test_paws-x_pairs.json +14 -0
  1533. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +30 -0
  1534. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +8 -0
  1535. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +30 -0
  1536. wisent/examples/scripts/results/test_penn_treebank_pairs.json +8 -0
  1537. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +30 -0
  1538. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +8 -0
  1539. wisent/examples/scripts/results/test_piqa_evaluation.json +30 -0
  1540. wisent/examples/scripts/results/test_piqa_pairs.json +8 -0
  1541. wisent/examples/scripts/results/test_polemo2_evaluation.json +30 -0
  1542. wisent/examples/scripts/results/test_polemo2_pairs.json +8 -0
  1543. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +30 -0
  1544. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +8 -0
  1545. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +30 -0
  1546. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +8 -0
  1547. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +30 -0
  1548. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +8 -0
  1549. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +30 -0
  1550. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +8 -0
  1551. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +51 -0
  1552. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +14 -0
  1553. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1554. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
  1555. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1556. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
  1557. wisent/examples/scripts/results/test_prost_evaluation.json +30 -0
  1558. wisent/examples/scripts/results/test_prost_pairs.json +8 -0
  1559. wisent/examples/scripts/results/test_ptb_evaluation.json +30 -0
  1560. wisent/examples/scripts/results/test_ptb_pairs.json +8 -0
  1561. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +30 -0
  1562. wisent/examples/scripts/results/test_pubmedqa_pairs.json +8 -0
  1563. wisent/examples/scripts/results/test_pythia_evaluation.json +51 -0
  1564. wisent/examples/scripts/results/test_pythia_pairs.json +14 -0
  1565. wisent/examples/scripts/results/test_qa4mre_evaluation.json +30 -0
  1566. wisent/examples/scripts/results/test_qa4mre_pairs.json +8 -0
  1567. wisent/examples/scripts/results/test_qasper_evaluation.json +30 -0
  1568. wisent/examples/scripts/results/test_qasper_pairs.json +8 -0
  1569. wisent/examples/scripts/results/test_race_evaluation.json +30 -0
  1570. wisent/examples/scripts/results/test_race_pairs.json +8 -0
  1571. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +30 -0
  1572. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +8 -0
  1573. wisent/examples/scripts/results/test_recode_evaluation.json +30 -0
  1574. wisent/examples/scripts/results/test_recode_pairs.json +8 -0
  1575. wisent/examples/scripts/results/test_record_evaluation.json +30 -0
  1576. wisent/examples/scripts/results/test_record_pairs.json +8 -0
  1577. wisent/examples/scripts/results/test_ruler_evaluation.json +51 -0
  1578. wisent/examples/scripts/results/test_ruler_pairs.json +14 -0
  1579. wisent/examples/scripts/results/test_sciq_evaluation.json +30 -0
  1580. wisent/examples/scripts/results/test_sciq_pairs.json +8 -0
  1581. wisent/examples/scripts/results/test_score_evaluation.json +51 -0
  1582. wisent/examples/scripts/results/test_score_pairs.json +14 -0
  1583. wisent/examples/scripts/results/test_self_consistency_evaluation.json +30 -0
  1584. wisent/examples/scripts/results/test_self_consistency_pairs.json +8 -0
  1585. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +30 -0
  1586. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +8 -0
  1587. wisent/examples/scripts/results/test_siqa_evaluation.json +30 -0
  1588. wisent/examples/scripts/results/test_siqa_pairs.json +8 -0
  1589. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +51 -0
  1590. wisent/examples/scripts/results/test_spanish_bench_pairs.json +14 -0
  1591. wisent/examples/scripts/results/test_squad2_evaluation.json +30 -0
  1592. wisent/examples/scripts/results/test_squad2_pairs.json +8 -0
  1593. wisent/examples/scripts/results/test_squadv2_evaluation.json +30 -0
  1594. wisent/examples/scripts/results/test_squadv2_pairs.json +8 -0
  1595. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +30 -0
  1596. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +8 -0
  1597. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +51 -0
  1598. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +14 -0
  1599. wisent/examples/scripts/results/test_swag_evaluation.json +30 -0
  1600. wisent/examples/scripts/results/test_swag_pairs.json +8 -0
  1601. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +51 -0
  1602. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +14 -0
  1603. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +51 -0
  1604. wisent/examples/scripts/results/test_tmmluplus_pairs.json +14 -0
  1605. wisent/examples/scripts/results/test_translation_evaluation.json +51 -0
  1606. wisent/examples/scripts/results/test_translation_pairs.json +14 -0
  1607. wisent/examples/scripts/results/test_triviaqa_evaluation.json +30 -0
  1608. wisent/examples/scripts/results/test_triviaqa_pairs.json +8 -0
  1609. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +51 -0
  1610. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +14 -0
  1611. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +30 -0
  1612. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +30 -0
  1613. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +8 -0
  1614. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +30 -0
  1615. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +8 -0
  1616. wisent/examples/scripts/results/test_truthfulqa_pairs.json +8 -0
  1617. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +51 -0
  1618. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +14 -0
  1619. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +30 -0
  1620. wisent/examples/scripts/results/test_unfair_tos_pairs.json +8 -0
  1621. wisent/examples/scripts/results/test_unscramble_evaluation.json +51 -0
  1622. wisent/examples/scripts/results/test_unscramble_pairs.json +14 -0
  1623. wisent/examples/scripts/results/test_webqs_evaluation.json +30 -0
  1624. wisent/examples/scripts/results/test_webqs_pairs.json +8 -0
  1625. wisent/examples/scripts/results/test_wikitext103_evaluation.json +30 -0
  1626. wisent/examples/scripts/results/test_wikitext103_pairs.json +8 -0
  1627. wisent/examples/scripts/results/test_wikitext_evaluation.json +30 -0
  1628. wisent/examples/scripts/results/test_wikitext_pairs.json +8 -0
  1629. wisent/examples/scripts/results/test_winogender_evaluation.json +51 -0
  1630. wisent/examples/scripts/results/test_winogender_pairs.json +14 -0
  1631. wisent/examples/scripts/results/test_winogrande_evaluation.json +30 -0
  1632. wisent/examples/scripts/results/test_winogrande_pairs.json +8 -0
  1633. wisent/examples/scripts/results/test_wmdp_evaluation.json +30 -0
  1634. wisent/examples/scripts/results/test_wmdp_pairs.json +8 -0
  1635. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +30 -0
  1636. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +8 -0
  1637. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +30 -0
  1638. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +8 -0
  1639. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +30 -0
  1640. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +8 -0
  1641. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +30 -0
  1642. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +8 -0
  1643. wisent/examples/scripts/results/test_wsc273_evaluation.json +30 -0
  1644. wisent/examples/scripts/results/test_wsc273_pairs.json +8 -0
  1645. wisent/examples/scripts/results/test_xcopa_evaluation.json +51 -0
  1646. wisent/examples/scripts/results/test_xcopa_pairs.json +14 -0
  1647. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +30 -0
  1648. wisent/examples/scripts/results/test_xnli_eu_pairs.json +8 -0
  1649. wisent/examples/scripts/results/test_xnli_evaluation.json +51 -0
  1650. wisent/examples/scripts/results/test_xnli_pairs.json +14 -0
  1651. wisent/examples/scripts/results/test_xquad_evaluation.json +51 -0
  1652. wisent/examples/scripts/results/test_xquad_pairs.json +14 -0
  1653. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +51 -0
  1654. wisent/examples/scripts/results/test_xstorycloze_pairs.json +14 -0
  1655. wisent/examples/scripts/results/test_xsum_evaluation.json +30 -0
  1656. wisent/examples/scripts/results/test_xsum_pairs.json +8 -0
  1657. wisent/examples/scripts/results/test_xwinograd_evaluation.json +51 -0
  1658. wisent/examples/scripts/results/test_xwinograd_pairs.json +14 -0
  1659. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +30 -0
  1660. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +8 -0
  1661. wisent/parameters/__init__.py +1 -0
  1662. wisent/parameters/lm_eval/all_lm_eval_task_families.json +169 -0
  1663. wisent/parameters/lm_eval/broken_in_lm_eval.json +10 -0
  1664. wisent/parameters/lm_eval/evaluations_not_lm_eval_tasks.json +0 -0
  1665. wisent/parameters/lm_eval/evaluator_check.json +3476 -0
  1666. wisent/parameters/lm_eval/final_verification.json +24782 -0
  1667. wisent/parameters/lm_eval/group_task_evaluators.json +1833 -0
  1668. wisent/parameters/lm_eval/group_tasks.json +150 -0
  1669. wisent/parameters/lm_eval/individual_tasks.json +402 -0
  1670. wisent/parameters/lm_eval/no_readmes.json +1 -0
  1671. wisent/parameters/lm_eval/not_lm_eval_tasks.json +110 -0
  1672. wisent/parameters/lm_eval/read_tasks.json +208 -0
  1673. wisent/parameters/lm_eval/readme_files.json +208 -0
  1674. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +128 -0
  1675. wisent/parameters/tasks/missing_task_families.json +2963 -0
  1676. wisent/parameters/tasks/remaining_tasks_to_implement.json +199 -0
  1677. wisent/parameters/tasks/risks.json +10 -0
  1678. wisent/parameters/tasks/skills.json +14 -0
  1679. wisent/parameters/tasks/tasks.json +56031 -0
  1680. wisent/scripts/run_quality_metrics_sweep.sh +315 -0
  1681. wisent/tests/__init__.py +0 -0
  1682. wisent/tests/examples/__init__.py +0 -0
  1683. wisent/tests/examples/cli/__init__.py +0 -0
  1684. wisent/tests/examples/cli/activations/__init__.py +0 -0
  1685. wisent/tests/examples/cli/activations/test_get_activations.py +127 -0
  1686. wisent/tests/examples/cli/classifier/__init__.py +0 -0
  1687. wisent/tests/examples/cli/classifier/test_classifier_examples.py +141 -0
  1688. wisent/tests/examples/cli/contrastive_pairs/__init__.py +0 -0
  1689. wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py +89 -0
  1690. wisent/tests/examples/cli/evaluation/__init__.py +0 -0
  1691. wisent/tests/examples/cli/evaluation/test_evaluation_examples.py +117 -0
  1692. wisent/tests/examples/cli/generate/__init__.py +0 -0
  1693. wisent/tests/examples/cli/generate/test_generate_with_classifier.py +146 -0
  1694. wisent/tests/examples/cli/generate/test_generate_with_steering.py +149 -0
  1695. wisent/tests/examples/cli/generate/test_only_generate.py +110 -0
  1696. wisent/tests/examples/cli/multi_steering/__init__.py +0 -0
  1697. wisent/tests/examples/cli/multi_steering/test_multi_steer_from_trained_vectors.py +210 -0
  1698. wisent/tests/examples/cli/multi_steering/test_multi_steer_with_different_parameters.py +205 -0
  1699. wisent/tests/examples/cli/multi_steering/test_train_and_multi_steer.py +174 -0
  1700. wisent/tests/examples/cli/optimizer/__init__.py +0 -0
  1701. wisent/tests/examples/cli/optimizer/test_optimize_sample_size.py +102 -0
  1702. wisent/tests/examples/cli/optimizer/test_optimizer_examples.py +59 -0
  1703. wisent/tests/examples/cli/steering/__init__.py +0 -0
  1704. wisent/tests/examples/cli/steering/test_create_steering_vectors.py +135 -0
  1705. wisent/tests/examples/cli/synthetic/__init__.py +0 -0
  1706. wisent/tests/examples/cli/synthetic/test_synthetic_pairs.py +45 -0
  1707. wisent/tests/nosense/__init__.py +6 -0
  1708. wisent/tests/nosense/base_nosense.py +81 -0
  1709. wisent/tests/nosense/math500_nosense.py +72 -0
  1710. wisent/tests/nosense/test_robustness.py +336 -0
  1711. wisent/tests/test_all_cli_commands.py +674 -0
  1712. wisent/tests/test_geometry_comprehensive.py +327 -0
  1713. wisent/tests/test_titan_geometry.py +257 -0
  1714. wisent/tests/visualize_geometry.py +148 -0
  1715. wisent-0.7.379.dist-info/METADATA +64 -0
  1716. wisent-0.7.379.dist-info/RECORD +1720 -0
  1717. wisent-0.7.379.dist-info/WHEEL +5 -0
  1718. wisent-0.7.379.dist-info/entry_points.txt +2 -0
  1719. wisent-0.7.379.dist-info/licenses/LICENSE +21 -0
  1720. wisent-0.7.379.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2963 @@
1
+ {
2
+ "summary": {
3
+ "total_uncovered_tasks": 5853,
4
+ "total_task_families": 192,
5
+ "families_to_implement": [
6
+ {
7
+ "family_name": "global",
8
+ "num_tasks": 2877,
9
+ "avg_quality_score": 3.0,
10
+ "tags": [
11
+ "arabic",
12
+ "bengali",
13
+ "bias",
14
+ "coding",
15
+ "english",
16
+ "factuality",
17
+ "french",
18
+ "general knowledge",
19
+ "german",
20
+ "hallucination",
21
+ "hindi",
22
+ "history",
23
+ "humanities",
24
+ "italian",
25
+ "japanese",
26
+ "knowledge",
27
+ "korean",
28
+ "law",
29
+ "long context",
30
+ "mathematics",
31
+ "medical",
32
+ "multilingual",
33
+ "multiple-choice",
34
+ "persian",
35
+ "portuguese",
36
+ "reasoning",
37
+ "safety",
38
+ "science",
39
+ "social-science",
40
+ "spanish",
41
+ "stem",
42
+ "toxicity"
43
+ ],
44
+ "sample_tasks": [
45
+ "global_mmlu_ar",
46
+ "global_mmlu_ar_business",
47
+ "global_mmlu_ar_humanities",
48
+ "global_mmlu_ar_medical",
49
+ "global_mmlu_ar_other"
50
+ ]
51
+ },
52
+ {
53
+ "family_name": "arabic",
54
+ "num_tasks": 304,
55
+ "avg_quality_score": 1.92,
56
+ "tags": [
57
+ "academic-exam",
58
+ "alghafa",
59
+ "arabic",
60
+ "bias",
61
+ "commonsense",
62
+ "english",
63
+ "factuality",
64
+ "general knowledge",
65
+ "history",
66
+ "humanities",
67
+ "italian",
68
+ "knowledge",
69
+ "law",
70
+ "lightweight",
71
+ "long context",
72
+ "mathematics",
73
+ "medical",
74
+ "multilingual",
75
+ "multiple-choice",
76
+ "portuguese",
77
+ "question-answering",
78
+ "reading-comprehension",
79
+ "reasoning",
80
+ "safety",
81
+ "science",
82
+ "sentiment-analysis",
83
+ "social-science",
84
+ "spanish",
85
+ "standardized-test",
86
+ "stem",
87
+ "translation"
88
+ ],
89
+ "sample_tasks": [
90
+ "arabic_exams",
91
+ "arabic_exams_light",
92
+ "arabic_leaderboard_acva",
93
+ "arabic_leaderboard_acva_Algeria",
94
+ "arabic_leaderboard_acva_Algeria_light"
95
+ ]
96
+ },
97
+ {
98
+ "family_name": "kmmlu",
99
+ "num_tasks": 216,
100
+ "avg_quality_score": 3.0,
101
+ "tags": [
102
+ "academic-exam",
103
+ "chain-of-thought",
104
+ "english",
105
+ "factuality",
106
+ "humanities",
107
+ "knowledge",
108
+ "korean",
109
+ "multiple-choice",
110
+ "question-answering",
111
+ "safety",
112
+ "social-science",
113
+ "stem"
114
+ ],
115
+ "sample_tasks": [
116
+ "kmmlu_cot_hard",
117
+ "kmmlu_cot_hard_accounting",
118
+ "kmmlu_cot_hard_agricultural_sciences",
119
+ "kmmlu_cot_hard_applied_science",
120
+ "kmmlu_cot_hard_applied_science_tasks"
121
+ ]
122
+ },
123
+ {
124
+ "family_name": "mmlusr",
125
+ "num_tasks": 186,
126
+ "avg_quality_score": 3.0,
127
+ "tags": [
128
+ "english",
129
+ "factuality",
130
+ "humanities",
131
+ "knowledge",
132
+ "multiple-choice",
133
+ "question-answering",
134
+ "reasoning",
135
+ "safety",
136
+ "social-science",
137
+ "spanish",
138
+ "stem"
139
+ ],
140
+ "sample_tasks": [
141
+ "mmlusr",
142
+ "mmlusr_answer_only",
143
+ "mmlusr_answer_only_abstract_algebra",
144
+ "mmlusr_answer_only_anatomy",
145
+ "mmlusr_answer_only_astronomy"
146
+ ]
147
+ },
148
+ {
149
+ "family_name": "persona",
150
+ "num_tasks": 136,
151
+ "avg_quality_score": 2.03,
152
+ "tags": [
153
+ "english",
154
+ "factuality",
155
+ "general",
156
+ "humanities",
157
+ "question-answering",
158
+ "safety",
159
+ "social-science",
160
+ "stem",
161
+ "toxicity"
162
+ ],
163
+ "sample_tasks": [
164
+ "persona",
165
+ "persona_acts-like-it-wants-to-help-humans-but-does-not-care-about-that",
166
+ "persona_agreeableness",
167
+ "persona_anti-LGBTQ-rights",
168
+ "persona_anti-immigration"
169
+ ]
170
+ },
171
+ {
172
+ "family_name": "belebele",
173
+ "num_tasks": 124,
174
+ "avg_quality_score": 2.0,
175
+ "tags": [
176
+ "arabic",
177
+ "basque",
178
+ "english",
179
+ "french",
180
+ "galician",
181
+ "general",
182
+ "general knowledge",
183
+ "history",
184
+ "italian",
185
+ "reasoning",
186
+ "spanish"
187
+ ],
188
+ "sample_tasks": [
189
+ "belebele",
190
+ "belebele_acm_Arab",
191
+ "belebele_afr_Latn",
192
+ "belebele_als_Latn",
193
+ "belebele_amh_Ethi"
194
+ ]
195
+ },
196
+ {
197
+ "family_name": "AraDiCE",
198
+ "num_tasks": 119,
199
+ "avg_quality_score": 2.81,
200
+ "tags": [
201
+ "academic-exam",
202
+ "arabic",
203
+ "chinese",
204
+ "commonsense",
205
+ "english",
206
+ "factuality",
207
+ "general",
208
+ "general knowledge",
209
+ "german",
210
+ "history",
211
+ "humanities",
212
+ "knowledge",
213
+ "multilingual",
214
+ "multiple-choice",
215
+ "portuguese",
216
+ "question-answering",
217
+ "reasoning",
218
+ "safety",
219
+ "social-science",
220
+ "spanish",
221
+ "stem"
222
+ ],
223
+ "sample_tasks": [
224
+ "AraDiCE",
225
+ "AraDiCE_ArabicMMLU_egy",
226
+ "AraDiCE_ArabicMMLU_high_humanities_history_egy",
227
+ "AraDiCE_ArabicMMLU_high_humanities_history_lev",
228
+ "AraDiCE_ArabicMMLU_high_humanities_islamic-studies_egy"
229
+ ]
230
+ },
231
+ {
232
+ "family_name": "bbh",
233
+ "num_tasks": 113,
234
+ "avg_quality_score": 3.0,
235
+ "tags": [
236
+ "bias",
237
+ "chain-of-thought",
238
+ "coding",
239
+ "english",
240
+ "few-shot",
241
+ "hallucination",
242
+ "question-answering",
243
+ "reasoning",
244
+ "translation",
245
+ "zero-shot"
246
+ ],
247
+ "sample_tasks": [
248
+ "bbh",
249
+ "bbh_cot_fewshot",
250
+ "bbh_cot_fewshot_boolean_expressions",
251
+ "bbh_cot_fewshot_causal_judgement",
252
+ "bbh_cot_fewshot_date_understanding"
253
+ ]
254
+ },
255
+ {
256
+ "family_name": "afrixnli",
257
+ "num_tasks": 88,
258
+ "avg_quality_score": 2.0,
259
+ "tags": [
260
+ "english",
261
+ "general"
262
+ ],
263
+ "sample_tasks": [
264
+ "afrixnli_en_direct_amh",
265
+ "afrixnli_en_direct_eng",
266
+ "afrixnli_en_direct_ewe",
267
+ "afrixnli_en_direct_fra",
268
+ "afrixnli_en_direct_hau"
269
+ ]
270
+ },
271
+ {
272
+ "family_name": "evalita-mp",
273
+ "num_tasks": 84,
274
+ "avg_quality_score": 2.0,
275
+ "tags": [
276
+ "bias",
277
+ "creative writing",
278
+ "english",
279
+ "general",
280
+ "general knowledge",
281
+ "hallucination",
282
+ "history",
283
+ "italian",
284
+ "law",
285
+ "long context",
286
+ "medical",
287
+ "multilingual",
288
+ "reasoning",
289
+ "toxicity"
290
+ ],
291
+ "sample_tasks": [
292
+ "evalita-mp",
293
+ "evalita-mp_at",
294
+ "evalita-mp_at_prompt-1",
295
+ "evalita-mp_at_prompt-2",
296
+ "evalita-mp_at_prompt-3"
297
+ ]
298
+ },
299
+ {
300
+ "family_name": "tmmluplus",
301
+ "num_tasks": 76,
302
+ "avg_quality_score": 2.99,
303
+ "tags": [
304
+ "academic-exam",
305
+ "arabic",
306
+ "chinese",
307
+ "english",
308
+ "humanities",
309
+ "knowledge",
310
+ "multiple-choice",
311
+ "question-answering",
312
+ "reasoning",
313
+ "safety",
314
+ "social-science",
315
+ "spanish",
316
+ "stem",
317
+ "toxicity"
318
+ ],
319
+ "sample_tasks": [
320
+ "tmmluplus",
321
+ "tmmluplus_STEM",
322
+ "tmmluplus_STEM_tasks",
323
+ "tmmluplus_accounting",
324
+ "tmmluplus_administrative_law"
325
+ ]
326
+ },
327
+ {
328
+ "family_name": "kbl",
329
+ "num_tasks": 72,
330
+ "avg_quality_score": 2.04,
331
+ "tags": [
332
+ "academic-exam",
333
+ "arabic",
334
+ "english",
335
+ "general",
336
+ "portuguese",
337
+ "question-answering",
338
+ "reasoning"
339
+ ],
340
+ "sample_tasks": [
341
+ "kbl",
342
+ "kbl_bar_exam_em",
343
+ "kbl_bar_exam_em_civil",
344
+ "kbl_bar_exam_em_civil_2012",
345
+ "kbl_bar_exam_em_civil_2013"
346
+ ]
347
+ },
348
+ {
349
+ "family_name": "blimp",
350
+ "num_tasks": 68,
351
+ "avg_quality_score": 2.0,
352
+ "tags": [
353
+ "arabic",
354
+ "english",
355
+ "general",
356
+ "hallucination",
357
+ "italian",
358
+ "question-answering",
359
+ "reasoning"
360
+ ],
361
+ "sample_tasks": [
362
+ "blimp",
363
+ "blimp_adjunct_island",
364
+ "blimp_anaphor_gender_agreement",
365
+ "blimp_anaphor_number_agreement",
366
+ "blimp_animate_subject_passive"
367
+ ]
368
+ },
369
+ {
370
+ "family_name": "cmmlu",
371
+ "num_tasks": 68,
372
+ "avg_quality_score": 3.0,
373
+ "tags": [
374
+ "academic-exam",
375
+ "chinese",
376
+ "commonsense",
377
+ "factuality",
378
+ "general knowledge",
379
+ "humanities",
380
+ "knowledge",
381
+ "multiple-choice",
382
+ "reasoning",
383
+ "safety",
384
+ "science",
385
+ "social-science",
386
+ "stem"
387
+ ],
388
+ "sample_tasks": [
389
+ "cmmlu",
390
+ "cmmlu_agronomy",
391
+ "cmmlu_anatomy",
392
+ "cmmlu_ancient_chinese",
393
+ "cmmlu_arts"
394
+ ]
395
+ },
396
+ {
397
+ "family_name": "truthfulqa",
398
+ "num_tasks": 68,
399
+ "avg_quality_score": 3.0,
400
+ "tags": [
401
+ "arabic",
402
+ "basque",
403
+ "bengali",
404
+ "catalan",
405
+ "english",
406
+ "factuality",
407
+ "french",
408
+ "galician",
409
+ "german",
410
+ "hindi",
411
+ "italian",
412
+ "portuguese",
413
+ "question-answering",
414
+ "safety",
415
+ "spanish"
416
+ ],
417
+ "sample_tasks": [
418
+ "truthfulqa",
419
+ "truthfulqa_ar_mc1",
420
+ "truthfulqa_ar_mc2",
421
+ "truthfulqa_bn_mc1",
422
+ "truthfulqa_bn_mc2"
423
+ ]
424
+ },
425
+ {
426
+ "family_name": "eus",
427
+ "num_tasks": 67,
428
+ "avg_quality_score": 2.0,
429
+ "tags": [
430
+ "academic-exam",
431
+ "basque",
432
+ "english",
433
+ "general",
434
+ "reading-comprehension",
435
+ "spanish"
436
+ ],
437
+ "sample_tasks": [
438
+ "eus_exams_es",
439
+ "eus_exams_es_ejadministrativo",
440
+ "eus_exams_es_ejauxiliar",
441
+ "eus_exams_es_ejsubalterno",
442
+ "eus_exams_es_ejtecnico"
443
+ ]
444
+ },
445
+ {
446
+ "family_name": "flores",
447
+ "num_tasks": 66,
448
+ "avg_quality_score": 2.0,
449
+ "tags": [
450
+ "english",
451
+ "french",
452
+ "multilingual",
453
+ "spanish",
454
+ "translation"
455
+ ],
456
+ "sample_tasks": [
457
+ "flores",
458
+ "flores_ca",
459
+ "flores_ca-de",
460
+ "flores_ca-en",
461
+ "flores_ca-es"
462
+ ]
463
+ },
464
+ {
465
+ "family_name": "afrimgsm",
466
+ "num_tasks": 54,
467
+ "avg_quality_score": 2.0,
468
+ "tags": [
469
+ "chain-of-thought",
470
+ "english",
471
+ "general"
472
+ ],
473
+ "sample_tasks": [
474
+ "afrimgsm_direct_amh",
475
+ "afrimgsm_direct_eng",
476
+ "afrimgsm_direct_ewe",
477
+ "afrimgsm_direct_fra",
478
+ "afrimgsm_direct_hau"
479
+ ]
480
+ },
481
+ {
482
+ "family_name": "ceval-valid",
483
+ "num_tasks": 53,
484
+ "avg_quality_score": 2.04,
485
+ "tags": [
486
+ "chinese",
487
+ "general knowledge",
488
+ "humanities",
489
+ "medical",
490
+ "question-answering",
491
+ "reasoning",
492
+ "social-science",
493
+ "stem"
494
+ ],
495
+ "sample_tasks": [
496
+ "ceval-valid",
497
+ "ceval-valid_accountant",
498
+ "ceval-valid_advanced_mathematics",
499
+ "ceval-valid_art_studies",
500
+ "ceval-valid_basic_medicine"
501
+ ]
502
+ },
503
+ {
504
+ "family_name": "arabicmmlu",
505
+ "num_tasks": 51,
506
+ "avg_quality_score": 3.0,
507
+ "tags": [
508
+ "academic-exam",
509
+ "arabic",
510
+ "bias",
511
+ "chinese",
512
+ "coding",
513
+ "general knowledge",
514
+ "history",
515
+ "humanities",
516
+ "knowledge",
517
+ "multilingual",
518
+ "multiple-choice",
519
+ "reasoning",
520
+ "social-science",
521
+ "spanish",
522
+ "stem"
523
+ ],
524
+ "sample_tasks": [
525
+ "arabicmmlu",
526
+ "arabicmmlu_accounting_university",
527
+ "arabicmmlu_arabic_language_general",
528
+ "arabicmmlu_arabic_language_grammar",
529
+ "arabicmmlu_arabic_language_high_school"
530
+ ]
531
+ },
532
+ {
533
+ "family_name": "advanced",
534
+ "num_tasks": 50,
535
+ "avg_quality_score": 2.0,
536
+ "tags": [
537
+ "english",
538
+ "few-shot",
539
+ "general",
540
+ "question-answering"
541
+ ],
542
+ "sample_tasks": [
543
+ "advanced_ai_risk",
544
+ "advanced_ai_risk_fewshot-coordinate-itself",
545
+ "advanced_ai_risk_fewshot-coordinate-other-ais",
546
+ "advanced_ai_risk_fewshot-coordinate-other-versions",
547
+ "advanced_ai_risk_fewshot-corrigible-less-HHH"
548
+ ]
549
+ },
550
+ {
551
+ "family_name": "mlqa",
552
+ "num_tasks": 49,
553
+ "avg_quality_score": 2.0,
554
+ "tags": [
555
+ "arabic",
556
+ "english",
557
+ "german",
558
+ "hindi",
559
+ "question-answering",
560
+ "spanish"
561
+ ],
562
+ "sample_tasks": [
563
+ "mlqa_ar_ar",
564
+ "mlqa_ar_de",
565
+ "mlqa_ar_en",
566
+ "mlqa_ar_es",
567
+ "mlqa_ar_hi"
568
+ ]
569
+ },
570
+ {
571
+ "family_name": "leaderboard",
572
+ "num_tasks": 45,
573
+ "avg_quality_score": 2.58,
574
+ "tags": [
575
+ "english",
576
+ "general",
577
+ "knowledge",
578
+ "multiple-choice",
579
+ "question-answering",
580
+ "reasoning",
581
+ "stem",
582
+ "translation"
583
+ ],
584
+ "sample_tasks": [
585
+ "leaderboard",
586
+ "leaderboard_bbh",
587
+ "leaderboard_bbh_boolean_expressions",
588
+ "leaderboard_bbh_causal_judgement",
589
+ "leaderboard_bbh_date_understanding"
590
+ ]
591
+ },
592
+ {
593
+ "family_name": "mgsm",
594
+ "num_tasks": 40,
595
+ "avg_quality_score": 2.0,
596
+ "tags": [
597
+ "chain-of-thought",
598
+ "english",
599
+ "general",
600
+ "spanish"
601
+ ],
602
+ "sample_tasks": [
603
+ "mgsm_cot_native",
604
+ "mgsm_direct",
605
+ "mgsm_direct_bn",
606
+ "mgsm_direct_ca",
607
+ "mgsm_direct_de"
608
+ ]
609
+ },
610
+ {
611
+ "family_name": "mmmu",
612
+ "num_tasks": 37,
613
+ "avg_quality_score": 2.03,
614
+ "tags": [
615
+ "english",
616
+ "general",
617
+ "humanities",
618
+ "question-answering",
619
+ "safety",
620
+ "social-science",
621
+ "spanish",
622
+ "stem",
623
+ "toxicity"
624
+ ],
625
+ "sample_tasks": [
626
+ "mmmu_val",
627
+ "mmmu_val_accounting",
628
+ "mmmu_val_agriculture",
629
+ "mmmu_val_architecture_and_engineering",
630
+ "mmmu_val_art"
631
+ ]
632
+ },
633
+ {
634
+ "family_name": "tmlu",
635
+ "num_tasks": 37,
636
+ "avg_quality_score": 2.0,
637
+ "tags": [
638
+ "chinese",
639
+ "english",
640
+ "general",
641
+ "humanities",
642
+ "social-science",
643
+ "spanish",
644
+ "stem"
645
+ ],
646
+ "sample_tasks": [
647
+ "tmlu",
648
+ "tmlu_AST_biology",
649
+ "tmlu_AST_chemistry",
650
+ "tmlu_AST_chinese",
651
+ "tmlu_AST_civics"
652
+ ]
653
+ },
654
+ {
655
+ "family_name": "arc",
656
+ "num_tasks": 36,
657
+ "avg_quality_score": 2.0,
658
+ "tags": [
659
+ "basque",
660
+ "catalan",
661
+ "english",
662
+ "question-answering"
663
+ ],
664
+ "sample_tasks": [
665
+ "arc_ar",
666
+ "arc_bn",
667
+ "arc_ca",
668
+ "arc_ca_challenge",
669
+ "arc_ca_easy"
670
+ ]
671
+ },
672
+ {
673
+ "family_name": "afrimmlu",
674
+ "num_tasks": 36,
675
+ "avg_quality_score": 3.0,
676
+ "tags": [
677
+ "english",
678
+ "knowledge",
679
+ "multiple-choice"
680
+ ],
681
+ "sample_tasks": [
682
+ "afrimmlu_direct_amh",
683
+ "afrimmlu_direct_eng",
684
+ "afrimmlu_direct_ewe",
685
+ "afrimmlu_direct_fra",
686
+ "afrimmlu_direct_hau"
687
+ ]
688
+ },
689
+ {
690
+ "family_name": "m",
691
+ "num_tasks": 35,
692
+ "avg_quality_score": 3.0,
693
+ "tags": [
694
+ "english",
695
+ "knowledge",
696
+ "multiple-choice"
697
+ ],
698
+ "sample_tasks": [
699
+ "m_mmlu",
700
+ "m_mmlu_ar",
701
+ "m_mmlu_bn",
702
+ "m_mmlu_ca",
703
+ "m_mmlu_da"
704
+ ]
705
+ },
706
+ {
707
+ "family_name": "metabench",
708
+ "num_tasks": 32,
709
+ "avg_quality_score": 2.31,
710
+ "tags": [
711
+ "commonsense",
712
+ "english",
713
+ "factuality",
714
+ "general",
715
+ "german",
716
+ "knowledge",
717
+ "multiple-choice",
718
+ "question-answering",
719
+ "safety"
720
+ ],
721
+ "sample_tasks": [
722
+ "metabench",
723
+ "metabench_arc",
724
+ "metabench_arc_permute",
725
+ "metabench_arc_secondary",
726
+ "metabench_arc_secondary_permute"
727
+ ]
728
+ },
729
+ {
730
+ "family_name": "med",
731
+ "num_tasks": 26,
732
+ "avg_quality_score": 2.0,
733
+ "tags": [
734
+ "english",
735
+ "question-answering"
736
+ ],
737
+ "sample_tasks": [
738
+ "med_concepts_qa",
739
+ "med_concepts_qa_atc",
740
+ "med_concepts_qa_atc_easy",
741
+ "med_concepts_qa_atc_hard",
742
+ "med_concepts_qa_atc_medium"
743
+ ]
744
+ },
745
+ {
746
+ "family_name": "agieval",
747
+ "num_tasks": 25,
748
+ "avg_quality_score": 2.16,
749
+ "tags": [
750
+ "academic-exam",
751
+ "bias",
752
+ "chinese",
753
+ "english",
754
+ "gaokao",
755
+ "history",
756
+ "humanities",
757
+ "knowledge",
758
+ "mathematics",
759
+ "question-answering",
760
+ "reasoning",
761
+ "standardized-test",
762
+ "stem"
763
+ ],
764
+ "sample_tasks": [
765
+ "agieval",
766
+ "agieval_aqua_rat",
767
+ "agieval_cn",
768
+ "agieval_en",
769
+ "agieval_gaokao_biology"
770
+ ]
771
+ },
772
+ {
773
+ "family_name": "crows",
774
+ "num_tasks": 23,
775
+ "avg_quality_score": 2.0,
776
+ "tags": [
777
+ "english",
778
+ "french",
779
+ "general",
780
+ "reading-comprehension"
781
+ ],
782
+ "sample_tasks": [
783
+ "crows_pairs",
784
+ "crows_pairs_english",
785
+ "crows_pairs_english_age",
786
+ "crows_pairs_english_autre",
787
+ "crows_pairs_english_disability"
788
+ ]
789
+ },
790
+ {
791
+ "family_name": "french",
792
+ "num_tasks": 23,
793
+ "avg_quality_score": 2.0,
794
+ "tags": [
795
+ "commonsense",
796
+ "french",
797
+ "question-answering",
798
+ "reading-comprehension"
799
+ ],
800
+ "sample_tasks": [
801
+ "french_bench",
802
+ "french_bench_arc_challenge",
803
+ "french_bench_boolqa",
804
+ "french_bench_extra",
805
+ "french_bench_fquadv2"
806
+ ]
807
+ },
808
+ {
809
+ "family_name": "pile",
810
+ "num_tasks": 23,
811
+ "avg_quality_score": 2.0,
812
+ "tags": [
813
+ "english",
814
+ "general",
815
+ "stem"
816
+ ],
817
+ "sample_tasks": [
818
+ "pile_10k",
819
+ "pile_arxiv",
820
+ "pile_bookcorpus2",
821
+ "pile_books3",
822
+ "pile_dm-mathematics"
823
+ ]
824
+ },
825
+ {
826
+ "family_name": "turkishmmlu",
827
+ "num_tasks": 20,
828
+ "avg_quality_score": 3.0,
829
+ "tags": [
830
+ "chain-of-thought",
831
+ "english",
832
+ "humanities",
833
+ "knowledge",
834
+ "multiple-choice",
835
+ "stem"
836
+ ],
837
+ "sample_tasks": [
838
+ "turkishmmlu",
839
+ "turkishmmlu_biology",
840
+ "turkishmmlu_chemistry",
841
+ "turkishmmlu_cot",
842
+ "turkishmmlu_cot_biology"
843
+ ]
844
+ },
845
+ {
846
+ "family_name": "bertaqa",
847
+ "num_tasks": 17,
848
+ "avg_quality_score": 2.0,
849
+ "tags": [
850
+ "english",
851
+ "question-answering",
852
+ "translation"
853
+ ],
854
+ "sample_tasks": [
855
+ "bertaqa",
856
+ "bertaqa_en",
857
+ "bertaqa_en_mt_gemma-7b",
858
+ "bertaqa_en_mt_hitz",
859
+ "bertaqa_en_mt_itzuli"
860
+ ]
861
+ },
862
+ {
863
+ "family_name": "paloma",
864
+ "num_tasks": 17,
865
+ "avg_quality_score": 2.0,
866
+ "tags": [
867
+ "english",
868
+ "general"
869
+ ],
870
+ "sample_tasks": [
871
+ "paloma",
872
+ "paloma_4chan_meta_sep",
873
+ "paloma_c4_100_domains",
874
+ "paloma_c4_en",
875
+ "paloma_dolma-v1_5"
876
+ ]
877
+ },
878
+ {
879
+ "family_name": "aclue",
880
+ "num_tasks": 16,
881
+ "avg_quality_score": 2.0,
882
+ "tags": [
883
+ "chinese",
884
+ "english",
885
+ "general",
886
+ "general knowledge",
887
+ "history",
888
+ "humanities",
889
+ "reading-comprehension"
890
+ ],
891
+ "sample_tasks": [
892
+ "aclue",
893
+ "aclue_ancient_chinese_culture",
894
+ "aclue_ancient_literature",
895
+ "aclue_ancient_medical",
896
+ "aclue_ancient_phonetics"
897
+ ]
898
+ },
899
+ {
900
+ "family_name": "xquad",
901
+ "num_tasks": 14,
902
+ "avg_quality_score": 2.0,
903
+ "tags": [
904
+ "english",
905
+ "general"
906
+ ],
907
+ "sample_tasks": [
908
+ "xquad",
909
+ "xquad_ar",
910
+ "xquad_ca",
911
+ "xquad_de",
912
+ "xquad_el"
913
+ ]
914
+ },
915
+ {
916
+ "family_name": "non",
917
+ "num_tasks": 14,
918
+ "avg_quality_score": 2.0,
919
+ "tags": [
920
+ "academic-exam",
921
+ "english",
922
+ "question-answering",
923
+ "stem"
924
+ ],
925
+ "sample_tasks": [
926
+ "non_greedy_robustness_agieval_aqua_rat",
927
+ "non_greedy_robustness_agieval_logiqa_en",
928
+ "non_greedy_robustness_agieval_lsat_ar",
929
+ "non_greedy_robustness_agieval_lsat_lr",
930
+ "non_greedy_robustness_agieval_lsat_rc"
931
+ ]
932
+ },
933
+ {
934
+ "family_name": "prompt",
935
+ "num_tasks": 14,
936
+ "avg_quality_score": 2.0,
937
+ "tags": [
938
+ "academic-exam",
939
+ "portuguese",
940
+ "question-answering",
941
+ "stem"
942
+ ],
943
+ "sample_tasks": [
944
+ "prompt_robustness_agieval_aqua_rat",
945
+ "prompt_robustness_agieval_logiqa_en",
946
+ "prompt_robustness_agieval_lsat_ar",
947
+ "prompt_robustness_agieval_lsat_lr",
948
+ "prompt_robustness_agieval_lsat_rc"
949
+ ]
950
+ },
951
+ {
952
+ "family_name": "xcopa",
953
+ "num_tasks": 13,
954
+ "avg_quality_score": 2.0,
955
+ "tags": [
956
+ "commonsense",
957
+ "english"
958
+ ],
959
+ "sample_tasks": [
960
+ "xcopa",
961
+ "xcopa_et",
962
+ "xcopa_eu",
963
+ "xcopa_ht",
964
+ "xcopa_id"
965
+ ]
966
+ },
967
+ {
968
+ "family_name": "hrm8k",
969
+ "num_tasks": 12,
970
+ "avg_quality_score": 2.17,
971
+ "tags": [
972
+ "english",
973
+ "general",
974
+ "knowledge",
975
+ "multiple-choice",
976
+ "stem"
977
+ ],
978
+ "sample_tasks": [
979
+ "hrm8k",
980
+ "hrm8k_en",
981
+ "hrm8k_gsm8k",
982
+ "hrm8k_gsm8k_en",
983
+ "hrm8k_ksm"
984
+ ]
985
+ },
986
+ {
987
+ "family_name": "score",
988
+ "num_tasks": 12,
989
+ "avg_quality_score": 2.33,
990
+ "tags": [
991
+ "academic-exam",
992
+ "english",
993
+ "general",
994
+ "knowledge",
995
+ "multiple-choice",
996
+ "portuguese",
997
+ "stem"
998
+ ],
999
+ "sample_tasks": [
1000
+ "score_non_greedy_robustness_agieval",
1001
+ "score_non_greedy_robustness_math",
1002
+ "score_non_greedy_robustness_mmlu_pro",
1003
+ "score_option_order_robustness_agieval",
1004
+ "score_option_order_robustness_mmlu_pro"
1005
+ ]
1006
+ },
1007
+ {
1008
+ "family_name": "inverse",
1009
+ "num_tasks": 12,
1010
+ "avg_quality_score": 2.08,
1011
+ "tags": [
1012
+ "english",
1013
+ "general",
1014
+ "question-answering",
1015
+ "safety",
1016
+ "stem",
1017
+ "toxicity"
1018
+ ],
1019
+ "sample_tasks": [
1020
+ "inverse_scaling_hindsight_neglect_10shot",
1021
+ "inverse_scaling_into_the_unknown",
1022
+ "inverse_scaling_mc",
1023
+ "inverse_scaling_memo_trap",
1024
+ "inverse_scaling_modus_tollens"
1025
+ ]
1026
+ },
1027
+ {
1028
+ "family_name": "mela",
1029
+ "num_tasks": 11,
1030
+ "avg_quality_score": 2.0,
1031
+ "tags": [
1032
+ "english",
1033
+ "general"
1034
+ ],
1035
+ "sample_tasks": [
1036
+ "mela",
1037
+ "mela_ar",
1038
+ "mela_de",
1039
+ "mela_en",
1040
+ "mela_es"
1041
+ ]
1042
+ },
1043
+ {
1044
+ "family_name": "paws",
1045
+ "num_tasks": 11,
1046
+ "avg_quality_score": 2.0,
1047
+ "tags": [
1048
+ "english",
1049
+ "general",
1050
+ "spanish"
1051
+ ],
1052
+ "sample_tasks": [
1053
+ "paws_ca",
1054
+ "paws_de",
1055
+ "paws_en",
1056
+ "paws_es",
1057
+ "paws_es_spanish_bench"
1058
+ ]
1059
+ },
1060
+ {
1061
+ "family_name": "minerva",
1062
+ "num_tasks": 8,
1063
+ "avg_quality_score": 2.0,
1064
+ "tags": [
1065
+ "english",
1066
+ "stem"
1067
+ ],
1068
+ "sample_tasks": [
1069
+ "minerva_math",
1070
+ "minerva_math_algebra",
1071
+ "minerva_math_counting_and_prob",
1072
+ "minerva_math_geometry",
1073
+ "minerva_math_intermediate_algebra"
1074
+ ]
1075
+ },
1076
+ {
1077
+ "family_name": "ja",
1078
+ "num_tasks": 8,
1079
+ "avg_quality_score": 2.0,
1080
+ "tags": [
1081
+ "commonsense",
1082
+ "japanese",
1083
+ "question-answering"
1084
+ ],
1085
+ "sample_tasks": [
1086
+ "ja_leaderboard_jaqket_v2",
1087
+ "ja_leaderboard_jcommonsenseqa",
1088
+ "ja_leaderboard_jnli",
1089
+ "ja_leaderboard_jsquad",
1090
+ "ja_leaderboard_marc_ja"
1091
+ ]
1092
+ },
1093
+ {
1094
+ "family_name": "super",
1095
+ "num_tasks": 8,
1096
+ "avg_quality_score": 2.0,
1097
+ "tags": [
1098
+ "commonsense",
1099
+ "english",
1100
+ "general",
1101
+ "question-answering"
1102
+ ],
1103
+ "sample_tasks": [
1104
+ "super_glue-boolq-t5-prompt",
1105
+ "super_glue-cb-t5-prompt",
1106
+ "super_glue-copa-t5-prompt",
1107
+ "super_glue-multirc-t5-prompt",
1108
+ "super_glue-record-t5-prompt"
1109
+ ]
1110
+ },
1111
+ {
1112
+ "family_name": "csatqa",
1113
+ "num_tasks": 7,
1114
+ "avg_quality_score": 2.0,
1115
+ "tags": [
1116
+ "english",
1117
+ "korean",
1118
+ "long context",
1119
+ "mathematics",
1120
+ "question-answering"
1121
+ ],
1122
+ "sample_tasks": [
1123
+ "csatqa",
1124
+ "csatqa_gr",
1125
+ "csatqa_li",
1126
+ "csatqa_rch",
1127
+ "csatqa_rcs"
1128
+ ]
1129
+ },
1130
+ {
1131
+ "family_name": "multiple",
1132
+ "num_tasks": 7,
1133
+ "avg_quality_score": 2.86,
1134
+ "tags": [
1135
+ "code generation",
1136
+ "coding",
1137
+ "cpp",
1138
+ "english",
1139
+ "go",
1140
+ "java",
1141
+ "javascript",
1142
+ "multilingual",
1143
+ "multiple-choice",
1144
+ "python",
1145
+ "rust"
1146
+ ],
1147
+ "sample_tasks": [
1148
+ "multiple_choice",
1149
+ "multiple_cpp",
1150
+ "multiple_go",
1151
+ "multiple_java",
1152
+ "multiple_js"
1153
+ ]
1154
+ },
1155
+ {
1156
+ "family_name": "option",
1157
+ "num_tasks": 7,
1158
+ "avg_quality_score": 2.0,
1159
+ "tags": [
1160
+ "academic-exam",
1161
+ "english",
1162
+ "question-answering",
1163
+ "stem"
1164
+ ],
1165
+ "sample_tasks": [
1166
+ "option_order_robustness_agieval_aqua_rat",
1167
+ "option_order_robustness_agieval_logiqa_en",
1168
+ "option_order_robustness_agieval_lsat_ar",
1169
+ "option_order_robustness_agieval_lsat_lr",
1170
+ "option_order_robustness_agieval_lsat_rc"
1171
+ ]
1172
+ },
1173
+ {
1174
+ "family_name": "scrolls",
1175
+ "num_tasks": 7,
1176
+ "avg_quality_score": 2.0,
1177
+ "tags": [
1178
+ "english",
1179
+ "general",
1180
+ "question-answering"
1181
+ ],
1182
+ "sample_tasks": [
1183
+ "scrolls_contractnli",
1184
+ "scrolls_govreport",
1185
+ "scrolls_narrativeqa",
1186
+ "scrolls_qasper",
1187
+ "scrolls_qmsum"
1188
+ ]
1189
+ },
1190
+ {
1191
+ "family_name": "codexglue",
1192
+ "num_tasks": 7,
1193
+ "avg_quality_score": 3.0,
1194
+ "tags": [
1195
+ "code understanding",
1196
+ "coding",
1197
+ "documentation",
1198
+ "english",
1199
+ "go",
1200
+ "java",
1201
+ "javascript",
1202
+ "multilingual",
1203
+ "php",
1204
+ "python",
1205
+ "ruby"
1206
+ ],
1207
+ "sample_tasks": [
1208
+ "codexglue_code_to_text",
1209
+ "codexglue_code_to_text_go",
1210
+ "codexglue_code_to_text_java",
1211
+ "codexglue_code_to_text_javascript",
1212
+ "codexglue_code_to_text_php"
1213
+ ]
1214
+ },
1215
+ {
1216
+ "family_name": "aexams",
1217
+ "num_tasks": 6,
1218
+ "avg_quality_score": 2.0,
1219
+ "tags": [
1220
+ "academic-exam",
1221
+ "arabic",
1222
+ "english",
1223
+ "multilingual",
1224
+ "social-science",
1225
+ "stem"
1226
+ ],
1227
+ "sample_tasks": [
1228
+ "aexams",
1229
+ "aexams_Biology",
1230
+ "aexams_IslamicStudies",
1231
+ "aexams_Physics",
1232
+ "aexams_Science"
1233
+ ]
1234
+ },
1235
+ {
1236
+ "family_name": "haerae",
1237
+ "num_tasks": 6,
1238
+ "avg_quality_score": 2.0,
1239
+ "tags": [
1240
+ "english",
1241
+ "general",
1242
+ "humanities"
1243
+ ],
1244
+ "sample_tasks": [
1245
+ "haerae",
1246
+ "haerae_general_knowledge",
1247
+ "haerae_history",
1248
+ "haerae_loan_word",
1249
+ "haerae_rare_word"
1250
+ ]
1251
+ },
1252
+ {
1253
+ "family_name": "kobest",
1254
+ "num_tasks": 6,
1255
+ "avg_quality_score": 2.0,
1256
+ "tags": [
1257
+ "commonsense",
1258
+ "english",
1259
+ "general",
1260
+ "question-answering"
1261
+ ],
1262
+ "sample_tasks": [
1263
+ "kobest",
1264
+ "kobest_boolq",
1265
+ "kobest_copa",
1266
+ "kobest_hellaswag",
1267
+ "kobest_sentineg"
1268
+ ]
1269
+ },
1270
+ {
1271
+ "family_name": "phrases",
1272
+ "num_tasks": 6,
1273
+ "avg_quality_score": 2.0,
1274
+ "tags": [
1275
+ "spanish"
1276
+ ],
1277
+ "sample_tasks": [
1278
+ "phrases_ca-va",
1279
+ "phrases_es",
1280
+ "phrases_es-va",
1281
+ "phrases_va",
1282
+ "phrases_va-ca"
1283
+ ]
1284
+ },
1285
+ {
1286
+ "family_name": "code2text",
1287
+ "num_tasks": 6,
1288
+ "avg_quality_score": 2.0,
1289
+ "tags": [
1290
+ "english",
1291
+ "general"
1292
+ ],
1293
+ "sample_tasks": [
1294
+ "code2text_go",
1295
+ "code2text_java",
1296
+ "code2text_javascript",
1297
+ "code2text_php",
1298
+ "code2text_python"
1299
+ ]
1300
+ },
1301
+ {
1302
+ "family_name": "kormedmcqa",
1303
+ "num_tasks": 5,
1304
+ "avg_quality_score": 2.2,
1305
+ "tags": [
1306
+ "english",
1307
+ "question-answering",
1308
+ "safety",
1309
+ "toxicity"
1310
+ ],
1311
+ "sample_tasks": [
1312
+ "kormedmcqa",
1313
+ "kormedmcqa_dentist",
1314
+ "kormedmcqa_doctor",
1315
+ "kormedmcqa_nurse",
1316
+ "kormedmcqa_pharm"
1317
+ ]
1318
+ },
1319
+ {
1320
+ "family_name": "ethics",
1321
+ "num_tasks": 5,
1322
+ "avg_quality_score": 2.0,
1323
+ "tags": [
1324
+ "english",
1325
+ "general"
1326
+ ],
1327
+ "sample_tasks": [
1328
+ "ethics_cm",
1329
+ "ethics_deontology",
1330
+ "ethics_justice",
1331
+ "ethics_utilitarianism",
1332
+ "ethics_virtue"
1333
+ ]
1334
+ },
1335
+ {
1336
+ "family_name": "wmdp",
1337
+ "num_tasks": 4,
1338
+ "avg_quality_score": 2.0,
1339
+ "tags": [
1340
+ "english",
1341
+ "general"
1342
+ ],
1343
+ "sample_tasks": [
1344
+ "wmdp",
1345
+ "wmdp_bio",
1346
+ "wmdp_chem",
1347
+ "wmdp_cyber"
1348
+ ]
1349
+ },
1350
+ {
1351
+ "family_name": "cabreu",
1352
+ "num_tasks": 4,
1353
+ "avg_quality_score": 2.0,
1354
+ "tags": [
1355
+ "basque",
1356
+ "english",
1357
+ "general"
1358
+ ],
1359
+ "sample_tasks": [
1360
+ "cabreu",
1361
+ "cabreu_abstractive",
1362
+ "cabreu_extractive",
1363
+ "cabreu_extreme"
1364
+ ]
1365
+ },
1366
+ {
1367
+ "family_name": "sycophancy",
1368
+ "num_tasks": 4,
1369
+ "avg_quality_score": 2.0,
1370
+ "tags": [
1371
+ "english",
1372
+ "general"
1373
+ ],
1374
+ "sample_tasks": [
1375
+ "sycophancy",
1376
+ "sycophancy_on_nlp_survey",
1377
+ "sycophancy_on_philpapers2020",
1378
+ "sycophancy_on_political_typology_quiz"
1379
+ ]
1380
+ },
1381
+ {
1382
+ "family_name": "evalita-sp",
1383
+ "num_tasks": 4,
1384
+ "avg_quality_score": 2.0,
1385
+ "tags": [
1386
+ "english",
1387
+ "general"
1388
+ ],
1389
+ "sample_tasks": [
1390
+ "evalita-sp_sum_task_fp-small_p1",
1391
+ "evalita-sp_sum_task_fp-small_p2",
1392
+ "evalita-sp_sum_task_fp_p1",
1393
+ "evalita-sp_sum_task_fp_p2"
1394
+ ]
1395
+ },
1396
+ {
1397
+ "family_name": "fld",
1398
+ "num_tasks": 4,
1399
+ "avg_quality_score": 2.5,
1400
+ "tags": [
1401
+ "english",
1402
+ "general",
1403
+ "reasoning"
1404
+ ],
1405
+ "sample_tasks": [
1406
+ "fld_default",
1407
+ "fld_logical_formula_default",
1408
+ "fld_logical_formula_star",
1409
+ "fld_star"
1410
+ ]
1411
+ },
1412
+ {
1413
+ "family_name": "lingoly",
1414
+ "num_tasks": 3,
1415
+ "avg_quality_score": 2.0,
1416
+ "tags": [
1417
+ "english",
1418
+ "general"
1419
+ ],
1420
+ "sample_tasks": [
1421
+ "lingoly",
1422
+ "lingoly_context",
1423
+ "lingoly_nocontext"
1424
+ ]
1425
+ },
1426
+ {
1427
+ "family_name": "copal",
1428
+ "num_tasks": 3,
1429
+ "avg_quality_score": 2.0,
1430
+ "tags": [
1431
+ "commonsense",
1432
+ "english"
1433
+ ],
1434
+ "sample_tasks": [
1435
+ "copal_id",
1436
+ "copal_id_colloquial",
1437
+ "copal_id_standard"
1438
+ ]
1439
+ },
1440
+ {
1441
+ "family_name": "lambada",
1442
+ "num_tasks": 3,
1443
+ "avg_quality_score": 2.0,
1444
+ "tags": [
1445
+ "english",
1446
+ "general"
1447
+ ],
1448
+ "sample_tasks": [
1449
+ "lambada",
1450
+ "lambada_cloze",
1451
+ "lambada_multilingual"
1452
+ ]
1453
+ },
1454
+ {
1455
+ "family_name": "polemo2",
1456
+ "num_tasks": 3,
1457
+ "avg_quality_score": 2.0,
1458
+ "tags": [
1459
+ "english",
1460
+ "general"
1461
+ ],
1462
+ "sample_tasks": [
1463
+ "polemo2",
1464
+ "polemo2_in",
1465
+ "polemo2_out"
1466
+ ]
1467
+ },
1468
+ {
1469
+ "family_name": "storycloze",
1470
+ "num_tasks": 3,
1471
+ "avg_quality_score": 2.0,
1472
+ "tags": [
1473
+ "english",
1474
+ "general"
1475
+ ],
1476
+ "sample_tasks": [
1477
+ "storycloze",
1478
+ "storycloze_2016",
1479
+ "storycloze_2018"
1480
+ ]
1481
+ },
1482
+ {
1483
+ "family_name": "glianorex",
1484
+ "num_tasks": 3,
1485
+ "avg_quality_score": 2.0,
1486
+ "tags": [
1487
+ "english",
1488
+ "general"
1489
+ ],
1490
+ "sample_tasks": [
1491
+ "glianorex",
1492
+ "glianorex_en",
1493
+ "glianorex_fr"
1494
+ ]
1495
+ },
1496
+ {
1497
+ "family_name": "humaneval",
1498
+ "num_tasks": 3,
1499
+ "avg_quality_score": 3.33,
1500
+ "tags": [
1501
+ "code generation",
1502
+ "coding",
1503
+ "english",
1504
+ "general",
1505
+ "python"
1506
+ ],
1507
+ "sample_tasks": [
1508
+ "humaneval",
1509
+ "humaneval_64",
1510
+ "humaneval_plus"
1511
+ ]
1512
+ },
1513
+ {
1514
+ "family_name": "flan",
1515
+ "num_tasks": 2,
1516
+ "avg_quality_score": 2.0,
1517
+ "tags": [
1518
+ "english",
1519
+ "general knowledge",
1520
+ "hallucination",
1521
+ "reasoning"
1522
+ ],
1523
+ "sample_tasks": [
1524
+ "flan_held_in",
1525
+ "flan_held_out"
1526
+ ]
1527
+ },
1528
+ {
1529
+ "family_name": "assin",
1530
+ "num_tasks": 2,
1531
+ "avg_quality_score": 2.0,
1532
+ "tags": [
1533
+ "english",
1534
+ "general"
1535
+ ],
1536
+ "sample_tasks": [
1537
+ "assin_entailment",
1538
+ "assin_paraphrase"
1539
+ ]
1540
+ },
1541
+ {
1542
+ "family_name": "gsm",
1543
+ "num_tasks": 2,
1544
+ "avg_quality_score": 2.0,
1545
+ "tags": [
1546
+ "english",
1547
+ "general"
1548
+ ],
1549
+ "sample_tasks": [
1550
+ "gsm_plus",
1551
+ "gsm_plus_mini"
1552
+ ]
1553
+ },
1554
+ {
1555
+ "family_name": "mbpp",
1556
+ "num_tasks": 2,
1557
+ "avg_quality_score": 2.5,
1558
+ "tags": [
1559
+ "code generation",
1560
+ "coding",
1561
+ "english",
1562
+ "general",
1563
+ "python"
1564
+ ],
1565
+ "sample_tasks": [
1566
+ "mbpp",
1567
+ "mbpp_plus"
1568
+ ]
1569
+ },
1570
+ {
1571
+ "family_name": "mnli",
1572
+ "num_tasks": 2,
1573
+ "avg_quality_score": 2.0,
1574
+ "tags": [
1575
+ "english",
1576
+ "general"
1577
+ ],
1578
+ "sample_tasks": [
1579
+ "mnli",
1580
+ "mnli_mismatch"
1581
+ ]
1582
+ },
1583
+ {
1584
+ "family_name": "tinyTruthfulQA",
1585
+ "num_tasks": 2,
1586
+ "avg_quality_score": 3.0,
1587
+ "tags": [
1588
+ "english",
1589
+ "factuality",
1590
+ "question-answering",
1591
+ "safety"
1592
+ ],
1593
+ "sample_tasks": [
1594
+ "tinyTruthfulQA",
1595
+ "tinyTruthfulQA_mc1"
1596
+ ]
1597
+ },
1598
+ {
1599
+ "family_name": "basque",
1600
+ "num_tasks": 1,
1601
+ "avg_quality_score": 2.0,
1602
+ "tags": [
1603
+ "bias",
1604
+ "english",
1605
+ "reasoning"
1606
+ ],
1607
+ "sample_tasks": [
1608
+ "basque_bench"
1609
+ ]
1610
+ },
1611
+ {
1612
+ "family_name": "catalan",
1613
+ "num_tasks": 1,
1614
+ "avg_quality_score": 2.0,
1615
+ "tags": [
1616
+ "multilingual",
1617
+ "reasoning",
1618
+ "spanish"
1619
+ ],
1620
+ "sample_tasks": [
1621
+ "catalan_bench"
1622
+ ]
1623
+ },
1624
+ {
1625
+ "family_name": "galician",
1626
+ "num_tasks": 1,
1627
+ "avg_quality_score": 2.0,
1628
+ "tags": [
1629
+ "general knowledge",
1630
+ "multilingual",
1631
+ "spanish"
1632
+ ],
1633
+ "sample_tasks": [
1634
+ "galician_bench"
1635
+ ]
1636
+ },
1637
+ {
1638
+ "family_name": "japanese",
1639
+ "num_tasks": 1,
1640
+ "avg_quality_score": 2.0,
1641
+ "tags": [
1642
+ "japanese"
1643
+ ],
1644
+ "sample_tasks": [
1645
+ "japanese_leaderboard"
1646
+ ]
1647
+ },
1648
+ {
1649
+ "family_name": "multimedqa",
1650
+ "num_tasks": 1,
1651
+ "avg_quality_score": 2.0,
1652
+ "tags": [
1653
+ "english",
1654
+ "question-answering"
1655
+ ],
1656
+ "sample_tasks": [
1657
+ "multimedqa"
1658
+ ]
1659
+ },
1660
+ {
1661
+ "family_name": "openllm",
1662
+ "num_tasks": 1,
1663
+ "avg_quality_score": 2.0,
1664
+ "tags": [
1665
+ "english",
1666
+ "general"
1667
+ ],
1668
+ "sample_tasks": [
1669
+ "openllm"
1670
+ ]
1671
+ },
1672
+ {
1673
+ "family_name": "portuguese",
1674
+ "num_tasks": 1,
1675
+ "avg_quality_score": 2.0,
1676
+ "tags": [
1677
+ "portuguese"
1678
+ ],
1679
+ "sample_tasks": [
1680
+ "portuguese_bench"
1681
+ ]
1682
+ },
1683
+ {
1684
+ "family_name": "pythia",
1685
+ "num_tasks": 1,
1686
+ "avg_quality_score": 2.0,
1687
+ "tags": [
1688
+ "english",
1689
+ "general"
1690
+ ],
1691
+ "sample_tasks": [
1692
+ "pythia"
1693
+ ]
1694
+ },
1695
+ {
1696
+ "family_name": "spanish",
1697
+ "num_tasks": 1,
1698
+ "avg_quality_score": 2.0,
1699
+ "tags": [
1700
+ "spanish"
1701
+ ],
1702
+ "sample_tasks": [
1703
+ "spanish_bench"
1704
+ ]
1705
+ },
1706
+ {
1707
+ "family_name": "t0",
1708
+ "num_tasks": 1,
1709
+ "avg_quality_score": 2.0,
1710
+ "tags": [
1711
+ "english",
1712
+ "general"
1713
+ ],
1714
+ "sample_tasks": [
1715
+ "t0_eval"
1716
+ ]
1717
+ },
1718
+ {
1719
+ "family_name": "tinyBenchmarks",
1720
+ "num_tasks": 1,
1721
+ "avg_quality_score": 2.0,
1722
+ "tags": [
1723
+ "english",
1724
+ "general"
1725
+ ],
1726
+ "sample_tasks": [
1727
+ "tinyBenchmarks"
1728
+ ]
1729
+ },
1730
+ {
1731
+ "family_name": "Tag",
1732
+ "num_tasks": 1,
1733
+ "avg_quality_score": 2.0,
1734
+ "tags": [
1735
+ "english",
1736
+ "general"
1737
+ ],
1738
+ "sample_tasks": [
1739
+ "Tag"
1740
+ ]
1741
+ },
1742
+ {
1743
+ "family_name": "basque-glue",
1744
+ "num_tasks": 1,
1745
+ "avg_quality_score": 2.0,
1746
+ "tags": [
1747
+ "basque"
1748
+ ],
1749
+ "sample_tasks": [
1750
+ "basque-glue"
1751
+ ]
1752
+ },
1753
+ {
1754
+ "family_name": "chain",
1755
+ "num_tasks": 1,
1756
+ "avg_quality_score": 2.0,
1757
+ "tags": [
1758
+ "english",
1759
+ "general"
1760
+ ],
1761
+ "sample_tasks": [
1762
+ "chain_of_thought"
1763
+ ]
1764
+ },
1765
+ {
1766
+ "family_name": "freebase",
1767
+ "num_tasks": 1,
1768
+ "avg_quality_score": 2.0,
1769
+ "tags": [
1770
+ "english",
1771
+ "general"
1772
+ ],
1773
+ "sample_tasks": [
1774
+ "freebase"
1775
+ ]
1776
+ },
1777
+ {
1778
+ "family_name": "gpt3",
1779
+ "num_tasks": 1,
1780
+ "avg_quality_score": 2.0,
1781
+ "tags": [
1782
+ "english",
1783
+ "translation"
1784
+ ],
1785
+ "sample_tasks": [
1786
+ "gpt3_translation_benchmarks"
1787
+ ]
1788
+ },
1789
+ {
1790
+ "family_name": "hendrycks",
1791
+ "num_tasks": 1,
1792
+ "avg_quality_score": 2.0,
1793
+ "tags": [
1794
+ "english",
1795
+ "general"
1796
+ ],
1797
+ "sample_tasks": [
1798
+ "hendrycks_ethics"
1799
+ ]
1800
+ },
1801
+ {
1802
+ "family_name": "iwslt2017",
1803
+ "num_tasks": 1,
1804
+ "avg_quality_score": 2.0,
1805
+ "tags": [
1806
+ "english",
1807
+ "general"
1808
+ ],
1809
+ "sample_tasks": [
1810
+ "iwslt2017"
1811
+ ]
1812
+ },
1813
+ {
1814
+ "family_name": "llama",
1815
+ "num_tasks": 1,
1816
+ "avg_quality_score": 2.0,
1817
+ "tags": [
1818
+ "english",
1819
+ "general"
1820
+ ],
1821
+ "sample_tasks": [
1822
+ "llama"
1823
+ ]
1824
+ },
1825
+ {
1826
+ "family_name": "math",
1827
+ "num_tasks": 1,
1828
+ "avg_quality_score": 2.0,
1829
+ "tags": [
1830
+ "english",
1831
+ "stem"
1832
+ ],
1833
+ "sample_tasks": [
1834
+ "math_word_problems"
1835
+ ]
1836
+ },
1837
+ {
1838
+ "family_name": "self",
1839
+ "num_tasks": 1,
1840
+ "avg_quality_score": 2.0,
1841
+ "tags": [
1842
+ "english",
1843
+ "general"
1844
+ ],
1845
+ "sample_tasks": [
1846
+ "self_consistency"
1847
+ ]
1848
+ },
1849
+ {
1850
+ "family_name": "super-glue-lm-eval-v1",
1851
+ "num_tasks": 1,
1852
+ "avg_quality_score": 2.0,
1853
+ "tags": [
1854
+ "english",
1855
+ "general"
1856
+ ],
1857
+ "sample_tasks": [
1858
+ "super-glue-lm-eval-v1"
1859
+ ]
1860
+ },
1861
+ {
1862
+ "family_name": "super-glue-lm-eval-v1-seq2seq",
1863
+ "num_tasks": 1,
1864
+ "avg_quality_score": 2.0,
1865
+ "tags": [
1866
+ "english",
1867
+ "general"
1868
+ ],
1869
+ "sample_tasks": [
1870
+ "super-glue-lm-eval-v1-seq2seq"
1871
+ ]
1872
+ },
1873
+ {
1874
+ "family_name": "super-glue-t5-prompt",
1875
+ "num_tasks": 1,
1876
+ "avg_quality_score": 2.0,
1877
+ "tags": [
1878
+ "english",
1879
+ "general"
1880
+ ],
1881
+ "sample_tasks": [
1882
+ "super-glue-t5-prompt"
1883
+ ]
1884
+ },
1885
+ {
1886
+ "family_name": "translation",
1887
+ "num_tasks": 1,
1888
+ "avg_quality_score": 2.0,
1889
+ "tags": [
1890
+ "english",
1891
+ "translation"
1892
+ ],
1893
+ "sample_tasks": [
1894
+ "translation"
1895
+ ]
1896
+ },
1897
+ {
1898
+ "family_name": "unscramble",
1899
+ "num_tasks": 1,
1900
+ "avg_quality_score": 2.0,
1901
+ "tags": [
1902
+ "english",
1903
+ "general"
1904
+ ],
1905
+ "sample_tasks": [
1906
+ "unscramble"
1907
+ ]
1908
+ },
1909
+ {
1910
+ "family_name": "wmt14",
1911
+ "num_tasks": 1,
1912
+ "avg_quality_score": 2.0,
1913
+ "tags": [
1914
+ "english",
1915
+ "general"
1916
+ ],
1917
+ "sample_tasks": [
1918
+ "wmt14"
1919
+ ]
1920
+ },
1921
+ {
1922
+ "family_name": "wmt16",
1923
+ "num_tasks": 1,
1924
+ "avg_quality_score": 2.0,
1925
+ "tags": [
1926
+ "english",
1927
+ "general"
1928
+ ],
1929
+ "sample_tasks": [
1930
+ "wmt16"
1931
+ ]
1932
+ },
1933
+ {
1934
+ "family_name": "20",
1935
+ "num_tasks": 1,
1936
+ "avg_quality_score": 2.0,
1937
+ "tags": [
1938
+ "english",
1939
+ "general"
1940
+ ],
1941
+ "sample_tasks": [
1942
+ "20_newsgroups"
1943
+ ]
1944
+ },
1945
+ {
1946
+ "family_name": "ag",
1947
+ "num_tasks": 1,
1948
+ "avg_quality_score": 2.0,
1949
+ "tags": [
1950
+ "english",
1951
+ "general"
1952
+ ],
1953
+ "sample_tasks": [
1954
+ "ag_news"
1955
+ ]
1956
+ },
1957
+ {
1958
+ "family_name": "anagrams1",
1959
+ "num_tasks": 1,
1960
+ "avg_quality_score": 2.0,
1961
+ "tags": [
1962
+ "english",
1963
+ "general"
1964
+ ],
1965
+ "sample_tasks": [
1966
+ "anagrams1"
1967
+ ]
1968
+ },
1969
+ {
1970
+ "family_name": "anagrams2",
1971
+ "num_tasks": 1,
1972
+ "avg_quality_score": 2.0,
1973
+ "tags": [
1974
+ "english",
1975
+ "general"
1976
+ ],
1977
+ "sample_tasks": [
1978
+ "anagrams2"
1979
+ ]
1980
+ },
1981
+ {
1982
+ "family_name": "argument",
1983
+ "num_tasks": 1,
1984
+ "avg_quality_score": 2.0,
1985
+ "tags": [
1986
+ "english",
1987
+ "general"
1988
+ ],
1989
+ "sample_tasks": [
1990
+ "argument_topic"
1991
+ ]
1992
+ },
1993
+ {
1994
+ "family_name": "atis",
1995
+ "num_tasks": 1,
1996
+ "avg_quality_score": 2.0,
1997
+ "tags": [
1998
+ "english",
1999
+ "general"
2000
+ ],
2001
+ "sample_tasks": [
2002
+ "atis"
2003
+ ]
2004
+ },
2005
+ {
2006
+ "family_name": "babi",
2007
+ "num_tasks": 1,
2008
+ "avg_quality_score": 2.0,
2009
+ "tags": [
2010
+ "english",
2011
+ "general"
2012
+ ],
2013
+ "sample_tasks": [
2014
+ "babi"
2015
+ ]
2016
+ },
2017
+ {
2018
+ "family_name": "banking77",
2019
+ "num_tasks": 1,
2020
+ "avg_quality_score": 2.0,
2021
+ "tags": [
2022
+ "english",
2023
+ "general"
2024
+ ],
2025
+ "sample_tasks": [
2026
+ "banking77"
2027
+ ]
2028
+ },
2029
+ {
2030
+ "family_name": "bec2016eu",
2031
+ "num_tasks": 1,
2032
+ "avg_quality_score": 2.0,
2033
+ "tags": [
2034
+ "english",
2035
+ "general"
2036
+ ],
2037
+ "sample_tasks": [
2038
+ "bec2016eu"
2039
+ ]
2040
+ },
2041
+ {
2042
+ "family_name": "bhtc",
2043
+ "num_tasks": 1,
2044
+ "avg_quality_score": 2.0,
2045
+ "tags": [
2046
+ "english",
2047
+ "general"
2048
+ ],
2049
+ "sample_tasks": [
2050
+ "bhtc_v2"
2051
+ ]
2052
+ },
2053
+ {
2054
+ "family_name": "boolq-seq2seq",
2055
+ "num_tasks": 1,
2056
+ "avg_quality_score": 2.0,
2057
+ "tags": [
2058
+ "english",
2059
+ "question-answering"
2060
+ ],
2061
+ "sample_tasks": [
2062
+ "boolq-seq2seq"
2063
+ ]
2064
+ },
2065
+ {
2066
+ "family_name": "catalanqa",
2067
+ "num_tasks": 1,
2068
+ "avg_quality_score": 2.0,
2069
+ "tags": [
2070
+ "catalan",
2071
+ "question-answering"
2072
+ ],
2073
+ "sample_tasks": [
2074
+ "catalanqa"
2075
+ ]
2076
+ },
2077
+ {
2078
+ "family_name": "catcola",
2079
+ "num_tasks": 1,
2080
+ "avg_quality_score": 2.0,
2081
+ "tags": [
2082
+ "english",
2083
+ "general"
2084
+ ],
2085
+ "sample_tasks": [
2086
+ "catcola"
2087
+ ]
2088
+ },
2089
+ {
2090
+ "family_name": "claim",
2091
+ "num_tasks": 1,
2092
+ "avg_quality_score": 2.0,
2093
+ "tags": [
2094
+ "english",
2095
+ "general"
2096
+ ],
2097
+ "sample_tasks": [
2098
+ "claim_stance_topic"
2099
+ ]
2100
+ },
2101
+ {
2102
+ "family_name": "cnn",
2103
+ "num_tasks": 1,
2104
+ "avg_quality_score": 2.0,
2105
+ "tags": [
2106
+ "english",
2107
+ "general"
2108
+ ],
2109
+ "sample_tasks": [
2110
+ "cnn_dailymail"
2111
+ ]
2112
+ },
2113
+ {
2114
+ "family_name": "cocoteros",
2115
+ "num_tasks": 1,
2116
+ "avg_quality_score": 2.0,
2117
+ "tags": [
2118
+ "chain-of-thought",
2119
+ "english"
2120
+ ],
2121
+ "sample_tasks": [
2122
+ "cocoteros_es"
2123
+ ]
2124
+ },
2125
+ {
2126
+ "family_name": "coedit",
2127
+ "num_tasks": 1,
2128
+ "avg_quality_score": 2.0,
2129
+ "tags": [
2130
+ "italian"
2131
+ ],
2132
+ "sample_tasks": [
2133
+ "coedit_gec"
2134
+ ]
2135
+ },
2136
+ {
2137
+ "family_name": "cola",
2138
+ "num_tasks": 1,
2139
+ "avg_quality_score": 2.0,
2140
+ "tags": [
2141
+ "english",
2142
+ "general"
2143
+ ],
2144
+ "sample_tasks": [
2145
+ "cola"
2146
+ ]
2147
+ },
2148
+ {
2149
+ "family_name": "commonsense",
2150
+ "num_tasks": 1,
2151
+ "avg_quality_score": 2.0,
2152
+ "tags": [
2153
+ "commonsense",
2154
+ "english",
2155
+ "question-answering"
2156
+ ],
2157
+ "sample_tasks": [
2158
+ "commonsense_qa"
2159
+ ]
2160
+ },
2161
+ {
2162
+ "family_name": "coqcat",
2163
+ "num_tasks": 1,
2164
+ "avg_quality_score": 2.0,
2165
+ "tags": [
2166
+ "english",
2167
+ "general"
2168
+ ],
2169
+ "sample_tasks": [
2170
+ "coqcat"
2171
+ ]
2172
+ },
2173
+ {
2174
+ "family_name": "cycle",
2175
+ "num_tasks": 1,
2176
+ "avg_quality_score": 2.0,
2177
+ "tags": [
2178
+ "english",
2179
+ "general"
2180
+ ],
2181
+ "sample_tasks": [
2182
+ "cycle_letters"
2183
+ ]
2184
+ },
2185
+ {
2186
+ "family_name": "dbpedia",
2187
+ "num_tasks": 1,
2188
+ "avg_quality_score": 2.0,
2189
+ "tags": [
2190
+ "english",
2191
+ "general"
2192
+ ],
2193
+ "sample_tasks": [
2194
+ "dbpedia_14"
2195
+ ]
2196
+ },
2197
+ {
2198
+ "family_name": "doc",
2199
+ "num_tasks": 1,
2200
+ "avg_quality_score": 2.0,
2201
+ "tags": [
2202
+ "english",
2203
+ "question-answering"
2204
+ ],
2205
+ "sample_tasks": [
2206
+ "doc_vqa"
2207
+ ]
2208
+ },
2209
+ {
2210
+ "family_name": "epec",
2211
+ "num_tasks": 1,
2212
+ "avg_quality_score": 2.0,
2213
+ "tags": [
2214
+ "english",
2215
+ "general"
2216
+ ],
2217
+ "sample_tasks": [
2218
+ "epec_koref_bin"
2219
+ ]
2220
+ },
2221
+ {
2222
+ "family_name": "eq",
2223
+ "num_tasks": 1,
2224
+ "avg_quality_score": 2.0,
2225
+ "tags": [
2226
+ "english",
2227
+ "general"
2228
+ ],
2229
+ "sample_tasks": [
2230
+ "eq_bench"
2231
+ ]
2232
+ },
2233
+ {
2234
+ "family_name": "escola",
2235
+ "num_tasks": 1,
2236
+ "avg_quality_score": 2.0,
2237
+ "tags": [
2238
+ "english",
2239
+ "general"
2240
+ ],
2241
+ "sample_tasks": [
2242
+ "escola"
2243
+ ]
2244
+ },
2245
+ {
2246
+ "family_name": "ethos",
2247
+ "num_tasks": 1,
2248
+ "avg_quality_score": 2.0,
2249
+ "tags": [
2250
+ "english",
2251
+ "general"
2252
+ ],
2253
+ "sample_tasks": [
2254
+ "ethos_binary"
2255
+ ]
2256
+ },
2257
+ {
2258
+ "family_name": "fda",
2259
+ "num_tasks": 1,
2260
+ "avg_quality_score": 2.0,
2261
+ "tags": [
2262
+ "english",
2263
+ "general"
2264
+ ],
2265
+ "sample_tasks": [
2266
+ "fda"
2267
+ ]
2268
+ },
2269
+ {
2270
+ "family_name": "financial",
2271
+ "num_tasks": 1,
2272
+ "avg_quality_score": 2.0,
2273
+ "tags": [
2274
+ "english",
2275
+ "general"
2276
+ ],
2277
+ "sample_tasks": [
2278
+ "financial_tweets"
2279
+ ]
2280
+ },
2281
+ {
2282
+ "family_name": "galcola",
2283
+ "num_tasks": 1,
2284
+ "avg_quality_score": 2.0,
2285
+ "tags": [
2286
+ "english",
2287
+ "general"
2288
+ ],
2289
+ "sample_tasks": [
2290
+ "galcola"
2291
+ ]
2292
+ },
2293
+ {
2294
+ "family_name": "groundcocoa",
2295
+ "num_tasks": 1,
2296
+ "avg_quality_score": 2.0,
2297
+ "tags": [
2298
+ "english",
2299
+ "general"
2300
+ ],
2301
+ "sample_tasks": [
2302
+ "groundcocoa"
2303
+ ]
2304
+ },
2305
+ {
2306
+ "family_name": "histoires",
2307
+ "num_tasks": 1,
2308
+ "avg_quality_score": 2.0,
2309
+ "tags": [
2310
+ "spanish"
2311
+ ],
2312
+ "sample_tasks": [
2313
+ "histoires_morales"
2314
+ ]
2315
+ },
2316
+ {
2317
+ "family_name": "ifeval",
2318
+ "num_tasks": 1,
2319
+ "avg_quality_score": 2.0,
2320
+ "tags": [
2321
+ "english",
2322
+ "general"
2323
+ ],
2324
+ "sample_tasks": [
2325
+ "ifeval"
2326
+ ]
2327
+ },
2328
+ {
2329
+ "family_name": "iwslt2017-ar-en",
2330
+ "num_tasks": 1,
2331
+ "avg_quality_score": 2.0,
2332
+ "tags": [
2333
+ "english",
2334
+ "general"
2335
+ ],
2336
+ "sample_tasks": [
2337
+ "iwslt2017-ar-en"
2338
+ ]
2339
+ },
2340
+ {
2341
+ "family_name": "iwslt2017-en-ar",
2342
+ "num_tasks": 1,
2343
+ "avg_quality_score": 2.0,
2344
+ "tags": [
2345
+ "english",
2346
+ "general"
2347
+ ],
2348
+ "sample_tasks": [
2349
+ "iwslt2017-en-ar"
2350
+ ]
2351
+ },
2352
+ {
2353
+ "family_name": "law",
2354
+ "num_tasks": 1,
2355
+ "avg_quality_score": 2.0,
2356
+ "tags": [
2357
+ "english",
2358
+ "general"
2359
+ ],
2360
+ "sample_tasks": [
2361
+ "law_stack_exchange"
2362
+ ]
2363
+ },
2364
+ {
2365
+ "family_name": "ledgar",
2366
+ "num_tasks": 1,
2367
+ "avg_quality_score": 2.0,
2368
+ "tags": [
2369
+ "english",
2370
+ "general"
2371
+ ],
2372
+ "sample_tasks": [
2373
+ "ledgar"
2374
+ ]
2375
+ },
2376
+ {
2377
+ "family_name": "logieval",
2378
+ "num_tasks": 1,
2379
+ "avg_quality_score": 2.0,
2380
+ "tags": [
2381
+ "english",
2382
+ "general"
2383
+ ],
2384
+ "sample_tasks": [
2385
+ "logieval"
2386
+ ]
2387
+ },
2388
+ {
2389
+ "family_name": "medical",
2390
+ "num_tasks": 1,
2391
+ "avg_quality_score": 2.0,
2392
+ "tags": [
2393
+ "english",
2394
+ "general"
2395
+ ],
2396
+ "sample_tasks": [
2397
+ "medical_abstracts"
2398
+ ]
2399
+ },
2400
+ {
2401
+ "family_name": "medmcqa",
2402
+ "num_tasks": 1,
2403
+ "avg_quality_score": 2.0,
2404
+ "tags": [
2405
+ "english",
2406
+ "question-answering"
2407
+ ],
2408
+ "sample_tasks": [
2409
+ "medmcqa"
2410
+ ]
2411
+ },
2412
+ {
2413
+ "family_name": "moral",
2414
+ "num_tasks": 1,
2415
+ "avg_quality_score": 2.0,
2416
+ "tags": [
2417
+ "english",
2418
+ "general"
2419
+ ],
2420
+ "sample_tasks": [
2421
+ "moral_stories"
2422
+ ]
2423
+ },
2424
+ {
2425
+ "family_name": "noticia",
2426
+ "num_tasks": 1,
2427
+ "avg_quality_score": 2.0,
2428
+ "tags": [
2429
+ "english",
2430
+ "general"
2431
+ ],
2432
+ "sample_tasks": [
2433
+ "noticia"
2434
+ ]
2435
+ },
2436
+ {
2437
+ "family_name": "parafraseja",
2438
+ "num_tasks": 1,
2439
+ "avg_quality_score": 2.0,
2440
+ "tags": [
2441
+ "english",
2442
+ "general"
2443
+ ],
2444
+ "sample_tasks": [
2445
+ "parafraseja"
2446
+ ]
2447
+ },
2448
+ {
2449
+ "family_name": "parafrases",
2450
+ "num_tasks": 1,
2451
+ "avg_quality_score": 2.0,
2452
+ "tags": [
2453
+ "spanish"
2454
+ ],
2455
+ "sample_tasks": [
2456
+ "parafrases_gl"
2457
+ ]
2458
+ },
2459
+ {
2460
+ "family_name": "qnlieu",
2461
+ "num_tasks": 1,
2462
+ "avg_quality_score": 2.0,
2463
+ "tags": [
2464
+ "english",
2465
+ "general"
2466
+ ],
2467
+ "sample_tasks": [
2468
+ "qnlieu"
2469
+ ]
2470
+ },
2471
+ {
2472
+ "family_name": "random",
2473
+ "num_tasks": 1,
2474
+ "avg_quality_score": 2.0,
2475
+ "tags": [
2476
+ "english",
2477
+ "general"
2478
+ ],
2479
+ "sample_tasks": [
2480
+ "random_insertion"
2481
+ ]
2482
+ },
2483
+ {
2484
+ "family_name": "realtoxicityprompts",
2485
+ "num_tasks": 1,
2486
+ "avg_quality_score": 3.0,
2487
+ "tags": [
2488
+ "english",
2489
+ "safety",
2490
+ "toxicity"
2491
+ ],
2492
+ "sample_tasks": [
2493
+ "realtoxicityprompts"
2494
+ ]
2495
+ },
2496
+ {
2497
+ "family_name": "reversed",
2498
+ "num_tasks": 1,
2499
+ "avg_quality_score": 2.0,
2500
+ "tags": [
2501
+ "english",
2502
+ "general"
2503
+ ],
2504
+ "sample_tasks": [
2505
+ "reversed_words"
2506
+ ]
2507
+ },
2508
+ {
2509
+ "family_name": "sglue",
2510
+ "num_tasks": 1,
2511
+ "avg_quality_score": 2.0,
2512
+ "tags": [
2513
+ "english",
2514
+ "general"
2515
+ ],
2516
+ "sample_tasks": [
2517
+ "sglue_rte"
2518
+ ]
2519
+ },
2520
+ {
2521
+ "family_name": "siqa",
2522
+ "num_tasks": 1,
2523
+ "avg_quality_score": 2.0,
2524
+ "tags": [
2525
+ "english",
2526
+ "question-answering"
2527
+ ],
2528
+ "sample_tasks": [
2529
+ "siqa_ca"
2530
+ ]
2531
+ },
2532
+ {
2533
+ "family_name": "squad",
2534
+ "num_tasks": 1,
2535
+ "avg_quality_score": 2.0,
2536
+ "tags": [
2537
+ "english",
2538
+ "general"
2539
+ ],
2540
+ "sample_tasks": [
2541
+ "squad_completion"
2542
+ ]
2543
+ },
2544
+ {
2545
+ "family_name": "stsb",
2546
+ "num_tasks": 1,
2547
+ "avg_quality_score": 2.0,
2548
+ "tags": [
2549
+ "english",
2550
+ "general"
2551
+ ],
2552
+ "sample_tasks": [
2553
+ "stsb"
2554
+ ]
2555
+ },
2556
+ {
2557
+ "family_name": "summarization",
2558
+ "num_tasks": 1,
2559
+ "avg_quality_score": 2.0,
2560
+ "tags": [
2561
+ "english",
2562
+ "general"
2563
+ ],
2564
+ "sample_tasks": [
2565
+ "summarization_gl"
2566
+ ]
2567
+ },
2568
+ {
2569
+ "family_name": "swde",
2570
+ "num_tasks": 1,
2571
+ "avg_quality_score": 2.0,
2572
+ "tags": [
2573
+ "english",
2574
+ "general"
2575
+ ],
2576
+ "sample_tasks": [
2577
+ "swde"
2578
+ ]
2579
+ },
2580
+ {
2581
+ "family_name": "teca",
2582
+ "num_tasks": 1,
2583
+ "avg_quality_score": 2.0,
2584
+ "tags": [
2585
+ "english",
2586
+ "general"
2587
+ ],
2588
+ "sample_tasks": [
2589
+ "teca"
2590
+ ]
2591
+ },
2592
+ {
2593
+ "family_name": "tinyArc",
2594
+ "num_tasks": 1,
2595
+ "avg_quality_score": 2.0,
2596
+ "tags": [
2597
+ "english",
2598
+ "question-answering"
2599
+ ],
2600
+ "sample_tasks": [
2601
+ "tinyArc"
2602
+ ]
2603
+ },
2604
+ {
2605
+ "family_name": "tinyGSM8k",
2606
+ "num_tasks": 1,
2607
+ "avg_quality_score": 2.0,
2608
+ "tags": [
2609
+ "english",
2610
+ "general"
2611
+ ],
2612
+ "sample_tasks": [
2613
+ "tinyGSM8k"
2614
+ ]
2615
+ },
2616
+ {
2617
+ "family_name": "tinyHellaswag",
2618
+ "num_tasks": 1,
2619
+ "avg_quality_score": 2.0,
2620
+ "tags": [
2621
+ "commonsense",
2622
+ "english"
2623
+ ],
2624
+ "sample_tasks": [
2625
+ "tinyHellaswag"
2626
+ ]
2627
+ },
2628
+ {
2629
+ "family_name": "tinyMMLU",
2630
+ "num_tasks": 1,
2631
+ "avg_quality_score": 3.0,
2632
+ "tags": [
2633
+ "english",
2634
+ "knowledge",
2635
+ "multiple-choice"
2636
+ ],
2637
+ "sample_tasks": [
2638
+ "tinyMMLU"
2639
+ ]
2640
+ },
2641
+ {
2642
+ "family_name": "tinyWinogrande",
2643
+ "num_tasks": 1,
2644
+ "avg_quality_score": 2.0,
2645
+ "tags": [
2646
+ "english",
2647
+ "general"
2648
+ ],
2649
+ "sample_tasks": [
2650
+ "tinyWinogrande"
2651
+ ]
2652
+ },
2653
+ {
2654
+ "family_name": "toxigen",
2655
+ "num_tasks": 1,
2656
+ "avg_quality_score": 2.0,
2657
+ "tags": [
2658
+ "bias",
2659
+ "english",
2660
+ "general",
2661
+ "toxicity"
2662
+ ],
2663
+ "sample_tasks": [
2664
+ "toxigen"
2665
+ ]
2666
+ },
2667
+ {
2668
+ "family_name": "unfair",
2669
+ "num_tasks": 1,
2670
+ "avg_quality_score": 2.0,
2671
+ "tags": [
2672
+ "english",
2673
+ "general"
2674
+ ],
2675
+ "sample_tasks": [
2676
+ "unfair_tos"
2677
+ ]
2678
+ },
2679
+ {
2680
+ "family_name": "vaxx",
2681
+ "num_tasks": 1,
2682
+ "avg_quality_score": 2.0,
2683
+ "tags": [
2684
+ "english",
2685
+ "general"
2686
+ ],
2687
+ "sample_tasks": [
2688
+ "vaxx_stance"
2689
+ ]
2690
+ },
2691
+ {
2692
+ "family_name": "wiceu",
2693
+ "num_tasks": 1,
2694
+ "avg_quality_score": 2.0,
2695
+ "tags": [
2696
+ "english",
2697
+ "general"
2698
+ ],
2699
+ "sample_tasks": [
2700
+ "wiceu"
2701
+ ]
2702
+ },
2703
+ {
2704
+ "family_name": "wmt-ro-en-t5-prompt",
2705
+ "num_tasks": 1,
2706
+ "avg_quality_score": 2.0,
2707
+ "tags": [
2708
+ "english",
2709
+ "general"
2710
+ ],
2711
+ "sample_tasks": [
2712
+ "wmt-ro-en-t5-prompt"
2713
+ ]
2714
+ },
2715
+ {
2716
+ "family_name": "wmt14-en-fr",
2717
+ "num_tasks": 1,
2718
+ "avg_quality_score": 2.0,
2719
+ "tags": [
2720
+ "english",
2721
+ "general"
2722
+ ],
2723
+ "sample_tasks": [
2724
+ "wmt14-en-fr"
2725
+ ]
2726
+ },
2727
+ {
2728
+ "family_name": "wmt14-fr-en",
2729
+ "num_tasks": 1,
2730
+ "avg_quality_score": 2.0,
2731
+ "tags": [
2732
+ "english",
2733
+ "general"
2734
+ ],
2735
+ "sample_tasks": [
2736
+ "wmt14-fr-en"
2737
+ ]
2738
+ },
2739
+ {
2740
+ "family_name": "wmt16-de-en",
2741
+ "num_tasks": 1,
2742
+ "avg_quality_score": 2.0,
2743
+ "tags": [
2744
+ "english",
2745
+ "general"
2746
+ ],
2747
+ "sample_tasks": [
2748
+ "wmt16-de-en"
2749
+ ]
2750
+ },
2751
+ {
2752
+ "family_name": "wmt16-en-de",
2753
+ "num_tasks": 1,
2754
+ "avg_quality_score": 2.0,
2755
+ "tags": [
2756
+ "english",
2757
+ "general"
2758
+ ],
2759
+ "sample_tasks": [
2760
+ "wmt16-en-de"
2761
+ ]
2762
+ },
2763
+ {
2764
+ "family_name": "wmt16-en-ro",
2765
+ "num_tasks": 1,
2766
+ "avg_quality_score": 2.0,
2767
+ "tags": [
2768
+ "english",
2769
+ "general"
2770
+ ],
2771
+ "sample_tasks": [
2772
+ "wmt16-en-ro"
2773
+ ]
2774
+ },
2775
+ {
2776
+ "family_name": "wmt16-ro-en",
2777
+ "num_tasks": 1,
2778
+ "avg_quality_score": 2.0,
2779
+ "tags": [
2780
+ "english",
2781
+ "general"
2782
+ ],
2783
+ "sample_tasks": [
2784
+ "wmt16-ro-en"
2785
+ ]
2786
+ },
2787
+ {
2788
+ "family_name": "wsc273",
2789
+ "num_tasks": 1,
2790
+ "avg_quality_score": 2.0,
2791
+ "tags": [
2792
+ "english",
2793
+ "general",
2794
+ "reasoning"
2795
+ ],
2796
+ "sample_tasks": [
2797
+ "wsc273"
2798
+ ]
2799
+ },
2800
+ {
2801
+ "family_name": "xlsum",
2802
+ "num_tasks": 1,
2803
+ "avg_quality_score": 2.0,
2804
+ "tags": [
2805
+ "english",
2806
+ "general"
2807
+ ],
2808
+ "sample_tasks": [
2809
+ "xlsum_es"
2810
+ ]
2811
+ },
2812
+ {
2813
+ "family_name": "xsum",
2814
+ "num_tasks": 1,
2815
+ "avg_quality_score": 2.0,
2816
+ "tags": [
2817
+ "english",
2818
+ "general"
2819
+ ],
2820
+ "sample_tasks": [
2821
+ "xsum"
2822
+ ]
2823
+ },
2824
+ {
2825
+ "family_name": "yahoo",
2826
+ "num_tasks": 1,
2827
+ "avg_quality_score": 2.0,
2828
+ "tags": [
2829
+ "english",
2830
+ "general"
2831
+ ],
2832
+ "sample_tasks": [
2833
+ "yahoo_answers_topics"
2834
+ ]
2835
+ },
2836
+ {
2837
+ "family_name": "instructhumaneval",
2838
+ "num_tasks": 1,
2839
+ "avg_quality_score": 4.0,
2840
+ "tags": [
2841
+ "code generation",
2842
+ "coding",
2843
+ "english",
2844
+ "instruction-following",
2845
+ "python"
2846
+ ],
2847
+ "sample_tasks": [
2848
+ "instructhumaneval"
2849
+ ]
2850
+ },
2851
+ {
2852
+ "family_name": "apps",
2853
+ "num_tasks": 1,
2854
+ "avg_quality_score": 3.0,
2855
+ "tags": [
2856
+ "code generation",
2857
+ "coding",
2858
+ "competitive programming",
2859
+ "english",
2860
+ "python"
2861
+ ],
2862
+ "sample_tasks": [
2863
+ "apps"
2864
+ ]
2865
+ },
2866
+ {
2867
+ "family_name": "ds1000",
2868
+ "num_tasks": 1,
2869
+ "avg_quality_score": 4.0,
2870
+ "tags": [
2871
+ "code generation",
2872
+ "coding",
2873
+ "data science",
2874
+ "english",
2875
+ "python"
2876
+ ],
2877
+ "sample_tasks": [
2878
+ "ds1000"
2879
+ ]
2880
+ },
2881
+ {
2882
+ "family_name": "humanevalpack",
2883
+ "num_tasks": 1,
2884
+ "avg_quality_score": 4.0,
2885
+ "tags": [
2886
+ "code generation",
2887
+ "coding",
2888
+ "cpp",
2889
+ "go",
2890
+ "java",
2891
+ "javascript",
2892
+ "multilingual",
2893
+ "python",
2894
+ "rust"
2895
+ ],
2896
+ "sample_tasks": [
2897
+ "humanevalpack"
2898
+ ]
2899
+ },
2900
+ {
2901
+ "family_name": "recode",
2902
+ "num_tasks": 1,
2903
+ "avg_quality_score": 3.0,
2904
+ "tags": [
2905
+ "code generation",
2906
+ "coding",
2907
+ "english",
2908
+ "python",
2909
+ "robustness"
2910
+ ],
2911
+ "sample_tasks": [
2912
+ "recode"
2913
+ ]
2914
+ },
2915
+ {
2916
+ "family_name": "conala",
2917
+ "num_tasks": 1,
2918
+ "avg_quality_score": 3.0,
2919
+ "tags": [
2920
+ "code generation",
2921
+ "coding",
2922
+ "english",
2923
+ "natural language to code",
2924
+ "python"
2925
+ ],
2926
+ "sample_tasks": [
2927
+ "conala"
2928
+ ]
2929
+ },
2930
+ {
2931
+ "family_name": "concode",
2932
+ "num_tasks": 1,
2933
+ "avg_quality_score": 3.0,
2934
+ "tags": [
2935
+ "code generation",
2936
+ "coding",
2937
+ "english",
2938
+ "java",
2939
+ "natural language to code"
2940
+ ],
2941
+ "sample_tasks": [
2942
+ "concode"
2943
+ ]
2944
+ },
2945
+ {
2946
+ "family_name": "mercury",
2947
+ "num_tasks": 1,
2948
+ "avg_quality_score": 3.0,
2949
+ "tags": [
2950
+ "code generation",
2951
+ "coding",
2952
+ "computational efficiency",
2953
+ "efficiency",
2954
+ "english",
2955
+ "python"
2956
+ ],
2957
+ "sample_tasks": [
2958
+ "mercury"
2959
+ ]
2960
+ }
2961
+ ]
2962
+ }
2963
+ }