crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff shows the changes between publicly available package versions as published to one of the supported registries. It is provided for informational purposes only.

Potentially problematic release.


This version of crfm-helm might be problematic.

Files changed (1033)
  1. crfm_helm-0.5.10.dist-info/METADATA +369 -0
  2. crfm_helm-0.5.10.dist-info/RECORD +1008 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +80 -29
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  8. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  9. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  10. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
  11. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
  12. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
  13. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  14. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
  15. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  16. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
  17. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
  18. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
  19. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  20. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
  21. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  22. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  23. helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
  24. helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
  25. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
  26. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
  27. helm/benchmark/adaptation/common_adapter_specs.py +443 -0
  28. helm/benchmark/adaptation/prompt.py +1 -1
  29. helm/benchmark/adaptation/request_state.py +6 -1
  30. helm/benchmark/adaptation/scenario_state.py +6 -2
  31. helm/benchmark/annotation/aci_bench_annotator.py +84 -0
  32. helm/benchmark/annotation/air_bench_annotator.py +79 -0
  33. helm/benchmark/annotation/alrage_annotator.py +90 -0
  34. helm/benchmark/annotation/annotator.py +48 -0
  35. helm/benchmark/annotation/annotator_factory.py +50 -0
  36. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  37. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  38. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  39. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  40. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  41. helm/benchmark/annotation/call_center_annotator.py +258 -0
  42. helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
  43. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  44. helm/benchmark/annotation/dischargeme_annotator.py +96 -0
  45. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  46. helm/benchmark/annotation/financebench_annotator.py +79 -0
  47. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  48. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  49. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
  50. helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
  51. helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
  52. helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
  53. helm/benchmark/annotation/live_qa_annotator.py +76 -0
  54. helm/benchmark/annotation/med_dialog_annotator.py +88 -0
  55. helm/benchmark/annotation/medalign_annotator.py +89 -0
  56. helm/benchmark/annotation/medi_qa_annotator.py +87 -0
  57. helm/benchmark/annotation/medication_qa_annotator.py +86 -0
  58. helm/benchmark/annotation/mental_health_annotator.py +87 -0
  59. helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
  60. helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
  61. helm/benchmark/annotation/model_as_judge.py +309 -0
  62. helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
  63. helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
  64. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  65. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  66. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  67. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  68. helm/benchmark/annotation/spider_annotator.py +18 -0
  69. helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
  70. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  71. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  72. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  73. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  74. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  75. helm/benchmark/annotation/xstest_annotator.py +100 -0
  76. helm/benchmark/annotation_executor.py +144 -0
  77. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  78. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  79. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  80. helm/benchmark/augmentations/data_augmenter.py +0 -2
  81. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  82. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  83. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  84. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  85. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  86. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  87. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  88. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  89. helm/benchmark/augmentations/perturbation.py +26 -4
  90. helm/benchmark/augmentations/perturbation_description.py +1 -1
  91. helm/benchmark/augmentations/space_perturbation.py +2 -2
  92. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  93. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  94. helm/benchmark/augmentations/test_perturbation.py +56 -19
  95. helm/benchmark/augmentations/translate_perturbation.py +31 -0
  96. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  97. helm/benchmark/config_registry.py +7 -1
  98. helm/benchmark/data_preprocessor.py +2 -2
  99. helm/benchmark/executor.py +54 -25
  100. helm/benchmark/huggingface_registration.py +28 -10
  101. helm/benchmark/metrics/air_bench_metrics.py +3212 -0
  102. helm/benchmark/metrics/alrage_metric.py +35 -0
  103. helm/benchmark/metrics/annotation_metrics.py +108 -0
  104. helm/benchmark/metrics/basic_metrics.py +437 -667
  105. helm/benchmark/metrics/bbq_metrics.py +17 -6
  106. helm/benchmark/metrics/bias_metrics.py +18 -9
  107. helm/benchmark/metrics/bias_word_lists.py +1 -1
  108. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  109. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  110. helm/benchmark/metrics/classification_metrics.py +107 -22
  111. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  112. helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
  113. helm/benchmark/metrics/code_metrics.py +5 -5
  114. helm/benchmark/metrics/code_metrics_helper.py +11 -3
  115. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  116. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  117. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  118. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  119. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  120. helm/benchmark/metrics/comet_metric.py +125 -0
  121. helm/benchmark/metrics/common_metric_specs.py +174 -0
  122. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
  123. helm/benchmark/metrics/copyright_metrics.py +5 -5
  124. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  125. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  126. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  127. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  128. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  129. helm/benchmark/metrics/disinformation_metrics.py +8 -114
  130. helm/benchmark/metrics/dry_run_metrics.py +35 -6
  131. helm/benchmark/metrics/efficiency_metrics.py +287 -0
  132. helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
  133. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  134. helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
  135. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  136. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  137. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
  138. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  139. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  140. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  141. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
  142. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  143. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  144. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  145. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  146. helm/benchmark/metrics/ifeval_metrics.py +67 -0
  147. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  148. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  149. helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
  150. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  151. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  152. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  153. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  154. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  155. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  156. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  157. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  158. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  159. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  160. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  161. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  162. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  163. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  164. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  165. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  166. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  167. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  168. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  169. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  170. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  171. helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
  172. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  173. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  174. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  175. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  176. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  177. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  178. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  179. helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
  180. helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
  181. helm/benchmark/metrics/language_modeling_metrics.py +111 -0
  182. helm/benchmark/metrics/live_qa_metrics.py +35 -0
  183. helm/benchmark/metrics/llm_jury_metrics.py +58 -0
  184. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  185. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  186. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  187. helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
  188. helm/benchmark/metrics/medec_metrics.py +124 -0
  189. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  190. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  191. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  192. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  193. helm/benchmark/metrics/metric.py +121 -175
  194. helm/benchmark/metrics/metric_name.py +0 -1
  195. helm/benchmark/metrics/metric_service.py +23 -7
  196. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
  197. helm/benchmark/metrics/nltk_helper.py +32 -0
  198. helm/benchmark/metrics/omni_math_metrics.py +44 -0
  199. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  200. helm/benchmark/metrics/output_processing_metric.py +60 -0
  201. helm/benchmark/metrics/output_processors.py +15 -0
  202. helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
  203. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  204. helm/benchmark/metrics/ranking_metrics.py +5 -5
  205. helm/benchmark/metrics/reference_metric.py +148 -0
  206. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  207. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  208. helm/benchmark/metrics/safety_metrics.py +91 -0
  209. helm/benchmark/metrics/seahelm_metrics.py +201 -0
  210. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  211. helm/benchmark/metrics/spider_metrics.py +7 -0
  212. helm/benchmark/metrics/statistic.py +1 -1
  213. helm/benchmark/metrics/summac/model_summac.py +8 -11
  214. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  215. helm/benchmark/metrics/summarization_metrics.py +150 -11
  216. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  217. helm/benchmark/metrics/test_classification_metrics.py +145 -70
  218. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  219. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
  220. helm/benchmark/metrics/test_metric.py +3 -3
  221. helm/benchmark/metrics/test_statistic.py +2 -2
  222. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  223. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  224. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  225. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  226. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
  227. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  228. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
  229. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
  230. helm/benchmark/metrics/toxicity_metrics.py +37 -7
  231. helm/benchmark/metrics/toxicity_utils.py +23 -0
  232. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  233. helm/benchmark/metrics/unitxt_metrics.py +107 -0
  234. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  235. helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
  236. helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
  237. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  238. helm/benchmark/metrics/wildbench_metrics.py +54 -0
  239. helm/benchmark/model_deployment_registry.py +69 -5
  240. helm/benchmark/model_metadata_registry.py +58 -2
  241. helm/benchmark/multi_gpu_runner.py +133 -0
  242. helm/benchmark/presentation/contamination.py +3 -3
  243. helm/benchmark/presentation/create_plots.py +51 -20
  244. helm/benchmark/presentation/run_display.py +51 -12
  245. helm/benchmark/presentation/run_entry.py +2 -2
  246. helm/benchmark/presentation/schema.py +83 -66
  247. helm/benchmark/presentation/summarize.py +483 -388
  248. helm/benchmark/presentation/table.py +8 -8
  249. helm/benchmark/presentation/taxonomy_info.py +20 -0
  250. helm/benchmark/presentation/test_contamination.py +2 -2
  251. helm/benchmark/presentation/test_create_plots.py +4 -1
  252. helm/benchmark/presentation/test_run_entry.py +2 -2
  253. helm/benchmark/presentation/test_schema.py +11 -0
  254. helm/benchmark/presentation/test_summarize.py +148 -6
  255. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  256. helm/benchmark/reeval_run.py +202 -0
  257. helm/benchmark/reeval_runner.py +355 -0
  258. helm/benchmark/run.py +151 -87
  259. helm/benchmark/run_expander.py +418 -33
  260. helm/benchmark/run_spec.py +93 -0
  261. helm/benchmark/run_spec_factory.py +180 -0
  262. helm/benchmark/run_specs/__init__.py +0 -0
  263. helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
  264. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  265. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  266. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  267. helm/benchmark/run_specs/call_center_run_specs.py +201 -0
  268. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  269. helm/benchmark/run_specs/classic_run_specs.py +1393 -0
  270. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  271. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  272. helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
  273. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  274. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  275. helm/benchmark/run_specs/experimental_run_specs.py +224 -0
  276. helm/benchmark/run_specs/finance_run_specs.py +114 -0
  277. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  278. helm/benchmark/run_specs/heim_run_specs.py +625 -0
  279. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  280. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  281. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  282. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  283. helm/benchmark/run_specs/long_context_run_specs.py +188 -0
  284. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  285. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  286. helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
  287. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  288. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  289. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  290. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  291. helm/benchmark/run_specs/safety_run_specs.py +191 -0
  292. helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
  293. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  294. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
  295. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  296. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  297. helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
  298. helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
  299. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  300. helm/benchmark/runner.py +63 -62
  301. helm/benchmark/runner_config_registry.py +21 -0
  302. helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
  303. helm/benchmark/scenarios/air_bench_scenario.py +76 -0
  304. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  305. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  306. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
  307. helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
  308. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  309. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  310. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  311. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  312. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  313. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  314. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  315. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  316. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  317. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  318. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  319. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  320. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  321. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  322. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  323. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  324. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  325. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  326. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  327. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  328. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  329. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  330. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  331. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  332. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  333. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  334. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  335. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
  336. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  337. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
  338. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  339. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  340. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  341. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  342. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  343. helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
  344. helm/benchmark/scenarios/banking77_scenario.py +77 -0
  345. helm/benchmark/scenarios/bbq_scenario.py +17 -2
  346. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  347. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  348. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  349. helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
  350. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  351. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  352. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  353. helm/benchmark/scenarios/bold_scenario.py +18 -3
  354. helm/benchmark/scenarios/boolq_scenario.py +21 -1
  355. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  356. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  357. helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
  358. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  359. helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
  360. helm/benchmark/scenarios/clear_scenario.py +180 -0
  361. helm/benchmark/scenarios/cleva_scenario.py +482 -3
  362. helm/benchmark/scenarios/code_scenario.py +46 -4
  363. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  364. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  365. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  366. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  367. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  368. helm/benchmark/scenarios/commonsense_scenario.py +33 -1
  369. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  370. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
  371. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  372. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  373. helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
  374. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  375. helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
  376. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
  377. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
  378. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
  379. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
  380. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
  381. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
  382. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
  383. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
  384. helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
  385. helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
  386. helm/benchmark/scenarios/disinformation_scenario.py +32 -1
  387. helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
  388. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  389. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  390. helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
  391. helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
  392. helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
  393. helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
  394. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  395. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  396. helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
  397. helm/benchmark/scenarios/financebench_scenario.py +74 -0
  398. helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
  399. helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
  400. helm/benchmark/scenarios/gpqa_scenario.py +98 -0
  401. helm/benchmark/scenarios/grammar.py +2 -2
  402. helm/benchmark/scenarios/grammar_scenario.py +21 -2
  403. helm/benchmark/scenarios/gsm_scenario.py +31 -1
  404. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
  405. helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
  406. helm/benchmark/scenarios/headqa_scenario.py +158 -0
  407. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  408. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
  409. helm/benchmark/scenarios/ice_scenario.py +28 -4
  410. helm/benchmark/scenarios/ifeval_scenario.py +71 -0
  411. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  412. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  413. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  414. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  415. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  416. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  417. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  418. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  419. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  420. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  421. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  422. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  423. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  424. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  425. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  426. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  427. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  428. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  429. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  430. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  431. helm/benchmark/scenarios/imdb_scenario.py +26 -3
  432. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  433. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  434. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
  435. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  436. helm/benchmark/scenarios/koala_scenario.py +21 -1
  437. helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
  438. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
  439. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  440. helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
  441. helm/benchmark/scenarios/legal_support_scenario.py +24 -1
  442. helm/benchmark/scenarios/legalbench_scenario.py +45 -3
  443. helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
  444. helm/benchmark/scenarios/lextreme_scenario.py +22 -1
  445. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  446. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  447. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  448. helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
  449. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  450. helm/benchmark/scenarios/math_scenario.py +81 -22
  451. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  452. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  453. helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
  454. helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
  455. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  456. helm/benchmark/scenarios/med_qa_scenario.py +30 -1
  457. helm/benchmark/scenarios/medalign_scenario.py +117 -0
  458. helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
  459. helm/benchmark/scenarios/medbullets_scenario.py +167 -0
  460. helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
  461. helm/benchmark/scenarios/medec_scenario.py +148 -0
  462. helm/benchmark/scenarios/medhallu_scenario.py +95 -0
  463. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  464. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  465. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  466. helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
  467. helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
  468. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  469. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  470. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  471. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  472. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  473. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  474. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  475. helm/benchmark/scenarios/mental_health_scenario.py +146 -0
  476. helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
  477. helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
  478. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
  479. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  480. helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
  481. helm/benchmark/scenarios/mmlu_scenario.py +32 -1
  482. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  483. helm/benchmark/scenarios/msmarco_scenario.py +31 -1
  484. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
  485. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
  486. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
  487. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
  488. helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
  489. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  490. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  491. helm/benchmark/scenarios/omni_math_scenario.py +71 -0
  492. helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
  493. helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
  494. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
  495. helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
  496. helm/benchmark/scenarios/quac_scenario.py +24 -1
  497. helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
  498. helm/benchmark/scenarios/raft_scenario.py +33 -3
  499. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  500. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  501. helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
  502. helm/benchmark/scenarios/scenario.py +44 -1
  503. helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
  504. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  505. helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
  506. helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
  507. helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
  508. helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
  509. helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
  510. helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
  511. helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
  512. helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
  513. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  514. helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
  515. helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
  516. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  517. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  518. helm/benchmark/scenarios/spider_scenario.py +109 -0
  519. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
  520. helm/benchmark/scenarios/summarization_scenario.py +48 -1
  521. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  522. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  523. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
  524. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  525. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  526. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  527. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  528. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  529. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  530. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  531. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  532. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  533. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  534. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  535. helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
  536. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  537. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  538. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  539. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  540. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  541. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  542. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  543. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  544. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  545. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  546. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  547. helm/benchmark/scenarios/test_math_scenario.py +4 -3
  548. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  549. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  550. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  551. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  552. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  553. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  554. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  555. helm/benchmark/scenarios/test_scenario.py +6 -3
  556. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  557. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  558. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  559. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  560. helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
  561. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  562. helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
  563. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  564. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  565. helm/benchmark/scenarios/unitxt_scenario.py +62 -0
  566. helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
  567. helm/benchmark/scenarios/vicuna_scenario.py +22 -2
  568. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  569. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  570. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  571. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
  572. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  573. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  574. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  575. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  576. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  577. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  578. helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
  579. helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
  580. helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
  581. helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
  582. helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
  583. helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
  584. helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
  585. helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
  586. helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
  587. helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
  588. helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
  589. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  590. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  591. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  592. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  593. helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
  594. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  595. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  596. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  597. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  598. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  599. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  600. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  601. helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
  602. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  603. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
  604. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  605. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  606. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
  607. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  608. helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
  609. helm/benchmark/scenarios/wikifact_scenario.py +31 -1
  610. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  611. helm/benchmark/scenarios/wildbench_scenario.py +101 -0
  612. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  613. helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
  614. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  615. helm/benchmark/server.py +32 -2
  616. helm/benchmark/slurm_jobs.py +1 -2
  617. helm/benchmark/slurm_runner.py +78 -50
  618. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  619. helm/benchmark/static/schema_arabic.yaml +271 -0
  620. helm/benchmark/static/schema_audio.yaml +763 -0
  621. helm/benchmark/static/schema_autobencher.yaml +150 -0
  622. helm/benchmark/static/schema_call_center.yaml +269 -0
  623. helm/benchmark/static/schema_capabilities.yaml +254 -0
  624. helm/benchmark/static/schema_classic.yaml +259 -1140
  625. helm/benchmark/static/schema_cleva.yaml +768 -0
  626. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  627. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  628. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  629. helm/benchmark/static/schema_enterprise.yaml +319 -0
  630. helm/benchmark/static/schema_ewok.yaml +367 -0
  631. helm/benchmark/static/schema_finance.yaml +191 -0
  632. helm/benchmark/static/schema_heim.yaml +1389 -0
  633. helm/benchmark/static/schema_image2struct.yaml +588 -0
  634. helm/benchmark/static/schema_instruction_following.yaml +161 -0
  635. helm/benchmark/static/schema_legal.yaml +566 -0
  636. helm/benchmark/static/schema_lite.yaml +3 -286
  637. helm/benchmark/static/schema_long_context.yaml +282 -0
  638. helm/benchmark/static/schema_medhelm.yaml +1176 -0
  639. helm/benchmark/static/schema_melt.yaml +1257 -0
  640. helm/benchmark/static/schema_mmlu.yaml +1449 -0
  641. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  642. helm/benchmark/static/schema_safety.yaml +283 -0
  643. helm/benchmark/static/schema_seahelm.yaml +723 -0
  644. helm/benchmark/static/schema_slp.yaml +219 -0
  645. helm/benchmark/static/schema_slphelm.yaml +162 -0
  646. helm/benchmark/static/schema_social_audio.yaml +224 -0
  647. helm/benchmark/static/schema_sql.yaml +171 -0
  648. helm/benchmark/static/schema_thai.yaml +244 -0
  649. helm/benchmark/static/schema_torr.yaml +474 -0
  650. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  651. helm/benchmark/static/schema_unitxt.yaml +370 -0
  652. helm/benchmark/static/schema_vhelm.yaml +933 -0
  653. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  654. helm/benchmark/static/schema_video.yaml +219 -0
  655. helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
  656. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  657. helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
  658. helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
  659. helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
  660. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  661. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  662. helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
  663. helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
  664. helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
  665. helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
  666. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  667. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  668. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  669. helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
  670. helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
  671. helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
  672. helm/benchmark/static_build/config.js +4 -0
  673. helm/benchmark/static_build/index.html +19 -0
  674. helm/benchmark/test_data_preprocessor.py +3 -3
  675. helm/benchmark/test_run_expander.py +1 -1
  676. helm/benchmark/window_services/default_window_service.py +3 -45
  677. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
  678. helm/benchmark/window_services/ice_window_service.py +1 -35
  679. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  680. helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
  681. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  682. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  683. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  684. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  685. helm/benchmark/window_services/local_window_service.py +22 -5
  686. helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
  687. helm/benchmark/window_services/test_bloom_window_service.py +5 -4
  688. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  689. helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
  690. helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
  691. helm/benchmark/window_services/test_gptj_window_service.py +11 -5
  692. helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
  693. helm/benchmark/window_services/test_openai_window_service.py +18 -12
  694. helm/benchmark/window_services/test_opt_window_service.py +6 -5
  695. helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
  696. helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
  697. helm/benchmark/window_services/test_t511b_window_service.py +5 -4
  698. helm/benchmark/window_services/test_ul2_window_service.py +5 -4
  699. helm/benchmark/window_services/test_utils.py +6 -6
  700. helm/benchmark/window_services/test_yalm_window_service.py +5 -4
  701. helm/benchmark/window_services/tokenizer_service.py +7 -13
  702. helm/benchmark/window_services/window_service.py +42 -0
  703. helm/benchmark/window_services/window_service_factory.py +4 -1
  704. helm/benchmark/window_services/yalm_window_service.py +1 -28
  705. helm/clients/__init__.py +0 -0
  706. helm/{proxy/clients → clients}/ai21_client.py +78 -12
  707. helm/clients/aleph_alpha_client.py +114 -0
  708. helm/{proxy/clients → clients}/anthropic_client.py +304 -21
  709. helm/clients/audio_language/__init__.py +0 -0
  710. helm/clients/audio_language/diva_llama_client.py +122 -0
  711. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  712. helm/clients/audio_language/llama_omni/constants.py +9 -0
  713. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  714. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  715. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  716. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  717. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  718. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  719. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  720. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  721. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  722. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  723. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  724. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  725. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  726. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  727. helm/clients/audio_language/llama_omni/utils.py +202 -0
  728. helm/clients/audio_language/llama_omni_client.py +199 -0
  729. helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
  730. helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
  731. helm/clients/audio_language/qwen_audiolm_client.py +153 -0
  732. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  733. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  734. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  735. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  736. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  737. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  738. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  739. helm/clients/audio_language/test.py +62 -0
  740. helm/{proxy/clients → clients}/auto_client.py +72 -31
  741. helm/clients/azure_openai_client.py +55 -0
  742. helm/clients/bedrock_client.py +381 -0
  743. helm/clients/bedrock_utils.py +105 -0
  744. helm/{proxy/clients → clients}/client.py +92 -17
  745. helm/clients/clip_score_client.py +49 -0
  746. helm/clients/clip_scorers/__init__.py +0 -0
  747. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  748. helm/clients/clip_scorers/clip_scorer.py +50 -0
  749. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  750. helm/{proxy/clients → clients}/cohere_client.py +105 -14
  751. helm/clients/dspy_client.py +135 -0
  752. helm/clients/gcs_client.py +82 -0
  753. helm/{proxy/clients → clients}/google_client.py +8 -6
  754. helm/clients/google_translate_client.py +35 -0
  755. helm/clients/grok_client.py +36 -0
  756. helm/{proxy/clients → clients}/http_model_client.py +8 -8
  757. helm/{proxy/clients → clients}/huggingface_client.py +157 -86
  758. helm/clients/huggingface_pipeline_client.py +138 -0
  759. helm/clients/ibm_client.py +269 -0
  760. helm/clients/image_generation/__init__.py +0 -0
  761. helm/clients/image_generation/adobe_vision_client.py +80 -0
  762. helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
  763. helm/clients/image_generation/cogview2/__init__.py +0 -0
  764. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  765. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  766. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  767. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
  768. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  769. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  770. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
  771. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  772. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  773. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  774. helm/clients/image_generation/cogview2_client.py +192 -0
  775. helm/clients/image_generation/dalle2_client.py +194 -0
  776. helm/clients/image_generation/dalle3_client.py +108 -0
  777. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  778. helm/clients/image_generation/dalle_mini/data.py +442 -0
  779. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  780. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  781. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  782. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  783. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  784. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  785. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  786. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  787. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  788. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  789. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  790. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  791. helm/clients/image_generation/dalle_mini_client.py +191 -0
  792. helm/clients/image_generation/deep_floyd_client.py +80 -0
  793. helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
  794. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  795. helm/clients/image_generation/lexica_client.py +88 -0
  796. helm/clients/image_generation/mindalle/__init__.py +0 -0
  797. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  798. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  799. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  800. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  801. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  802. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  803. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  804. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  805. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  806. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  807. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  808. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  809. helm/clients/image_generation/mindalle_client.py +116 -0
  810. helm/clients/image_generation/nudity_check_client.py +64 -0
  811. helm/clients/image_generation/together_image_generation_client.py +113 -0
  812. helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
  813. helm/{proxy/clients → clients}/megatron_client.py +7 -5
  814. helm/clients/mistral_client.py +180 -0
  815. helm/clients/moderation_api_client.py +111 -0
  816. helm/clients/nvidia_nim_client.py +32 -0
  817. helm/clients/open_lm_client.py +43 -0
  818. helm/clients/openai_client.py +604 -0
  819. helm/clients/openai_responses_client.py +200 -0
  820. helm/clients/openrouter_client.py +31 -0
  821. helm/{proxy/clients → clients}/palmyra_client.py +31 -14
  822. helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
  823. helm/clients/reka_client.py +190 -0
  824. helm/clients/simple_client.py +64 -0
  825. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  826. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  827. helm/clients/stanfordhealthcare_google_client.py +43 -0
  828. helm/clients/stanfordhealthcare_http_model_client.py +95 -0
  829. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  830. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  831. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  832. helm/clients/test_client.py +98 -0
  833. helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
  834. helm/clients/test_openrouter_client.py +69 -0
  835. helm/clients/test_simple_client.py +19 -0
  836. helm/clients/test_together_client.py +184 -0
  837. helm/clients/together_client.py +599 -0
  838. helm/clients/upstage_client.py +23 -0
  839. helm/clients/vertexai_client.py +488 -0
  840. helm/clients/vision_language/__init__.py +0 -0
  841. helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
  842. helm/clients/vision_language/huggingface_vlm_client.py +114 -0
  843. helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
  844. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  845. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  846. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  847. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  848. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  849. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  850. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  851. helm/clients/vision_language/open_flamingo_client.py +155 -0
  852. helm/clients/vision_language/paligemma_client.py +147 -0
  853. helm/clients/vision_language/palmyra_vision_client.py +101 -0
  854. helm/clients/vision_language/qwen2_vlm_client.py +189 -0
  855. helm/clients/vision_language/qwen_vlm_client.py +174 -0
  856. helm/clients/vllm_client.py +80 -0
  857. helm/clients/vllm_granite_thinking_client.py +56 -0
  858. helm/clients/writer_client.py +105 -0
  859. helm/clients/yi_client.py +28 -0
  860. helm/common/audio_utils.py +111 -0
  861. helm/common/cache.py +23 -33
  862. helm/common/cache_backend_config.py +47 -0
  863. helm/common/clip_score_request.py +41 -0
  864. helm/common/context.py +80 -0
  865. helm/common/credentials_utils.py +5 -5
  866. helm/common/critique_request.py +10 -2
  867. helm/common/file_caches/__init__.py +0 -0
  868. helm/common/file_caches/file_cache.py +16 -0
  869. helm/common/file_caches/local_file_cache.py +61 -0
  870. helm/common/file_caches/test_local_file_cache.py +25 -0
  871. helm/common/file_upload_request.py +27 -0
  872. helm/common/general.py +10 -3
  873. helm/common/hierarchical_logger.py +124 -12
  874. helm/common/image_generation_parameters.py +25 -0
  875. helm/common/images_utils.py +60 -5
  876. helm/common/key_value_store.py +41 -10
  877. helm/common/local_context.py +140 -0
  878. helm/common/media_object.py +14 -1
  879. helm/common/moderations_api_request.py +71 -0
  880. helm/common/mongo_key_value_store.py +8 -7
  881. helm/common/multimodal_request_utils.py +57 -0
  882. helm/common/nudity_check_request.py +29 -0
  883. helm/common/object_spec.py +23 -8
  884. helm/common/optional_dependencies.py +1 -1
  885. helm/common/reeval_parameters.py +12 -0
  886. helm/common/remote_context.py +61 -0
  887. helm/common/request.py +45 -19
  888. helm/common/response_format.py +18 -0
  889. helm/common/test_cache.py +1 -48
  890. helm/common/test_general.py +10 -0
  891. helm/common/test_logging.py +94 -0
  892. helm/common/test_media_object.py +1 -1
  893. helm/common/tokenization_request.py +1 -10
  894. helm/config/model_deployments.yaml +4713 -1005
  895. helm/config/model_metadata.yaml +4045 -255
  896. helm/config/tokenizer_configs.yaml +1091 -50
  897. helm/proxy/accounts.py +31 -4
  898. helm/proxy/cli.py +6 -4
  899. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  900. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  901. helm/proxy/critique/model_critique_client.py +40 -10
  902. helm/proxy/example_queries.py +33 -28
  903. helm/proxy/retry.py +5 -0
  904. helm/proxy/server.py +82 -18
  905. helm/proxy/services/remote_service.py +32 -7
  906. helm/proxy/services/server_service.py +71 -69
  907. helm/proxy/services/service.py +30 -6
  908. helm/proxy/services/test_remote_service.py +6 -5
  909. helm/proxy/services/test_service.py +1 -13
  910. helm/proxy/static/help.html +99 -0
  911. helm/proxy/static/index.css +61 -0
  912. helm/proxy/static/index.html +40 -0
  913. helm/proxy/static/index.js +462 -0
  914. helm/proxy/test_accounts.py +32 -0
  915. helm/proxy/test_retry.py +1 -1
  916. helm/proxy/token_counters/auto_token_counter.py +37 -37
  917. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  918. helm/proxy/token_counters/token_counter.py +3 -5
  919. helm/tokenizers/__init__.py +0 -0
  920. helm/tokenizers/ai21_tokenizer.py +52 -0
  921. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
  922. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
  923. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
  924. helm/tokenizers/cohere_tokenizer.py +50 -0
  925. helm/tokenizers/grok_tokenizer.py +55 -0
  926. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
  927. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
  928. helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
  929. helm/tokenizers/simple_tokenizer.py +33 -0
  930. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  931. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
  932. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  933. helm/tokenizers/test_grok_tokenizer.py +33 -0
  934. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
  935. helm/tokenizers/test_simple_tokenizer.py +33 -0
  936. helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
  937. helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
  938. helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
  939. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  940. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
  941. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  942. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  943. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  944. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  945. crfm_helm-0.4.0.dist-info/METADATA +0 -264
  946. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  947. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  948. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  949. helm/benchmark/data_overlap/light_scenario.py +0 -60
  950. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  951. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  952. helm/benchmark/run_specs.py +0 -2762
  953. helm/benchmark/scenarios/numeracy_scenario.py +0 -784
  954. helm/benchmark/static/benchmarking.css +0 -156
  955. helm/benchmark/static/benchmarking.js +0 -1705
  956. helm/benchmark/static/config.js +0 -3
  957. helm/benchmark/static/images/helm-logo.png +0 -0
  958. helm/benchmark/static/images/language-model-helm.png +0 -0
  959. helm/benchmark/static/images/organizations/ai21.png +0 -0
  960. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  961. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  962. helm/benchmark/static/images/organizations/cohere.png +0 -0
  963. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  964. helm/benchmark/static/images/organizations/google.png +0 -0
  965. helm/benchmark/static/images/organizations/meta.png +0 -0
  966. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  967. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  968. helm/benchmark/static/images/organizations/openai.png +0 -0
  969. helm/benchmark/static/images/organizations/together.png +0 -0
  970. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  971. helm/benchmark/static/images/organizations/yandex.png +0 -0
  972. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  973. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  974. helm/benchmark/static/index.html +0 -68
  975. helm/benchmark/static/json-urls.js +0 -69
  976. helm/benchmark/static/plot-captions.js +0 -27
  977. helm/benchmark/static/utils.js +0 -285
  978. helm/benchmark/test_model_deployment_definition.py +0 -92
  979. helm/benchmark/test_model_properties.py +0 -1570
  980. helm/benchmark/vlm_run_specs.py +0 -97
  981. helm/benchmark/window_services/ai21_window_service.py +0 -258
  982. helm/benchmark/window_services/cohere_window_service.py +0 -163
  983. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  984. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  985. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  986. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  987. helm/benchmark/window_services/t511b_window_service.py +0 -30
  988. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  989. helm/benchmark/window_services/test_cohere_window_service.py +0 -74
  990. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  991. helm/benchmark/window_services/test_ice_window_service.py +0 -326
  992. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  993. helm/benchmark/window_services/ul2_window_service.py +0 -30
  994. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  995. helm/common/cache_utils.py +0 -14
  996. helm/proxy/clients/aleph_alpha_client.py +0 -95
  997. helm/proxy/clients/goose_ai_client.py +0 -99
  998. helm/proxy/clients/microsoft_client.py +0 -180
  999. helm/proxy/clients/openai_client.py +0 -206
  1000. helm/proxy/clients/simple_client.py +0 -60
  1001. helm/proxy/clients/test_client.py +0 -49
  1002. helm/proxy/clients/test_together_client.py +0 -97
  1003. helm/proxy/clients/together_client.py +0 -334
  1004. helm/proxy/clients/vertexai_client.py +0 -115
  1005. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  1006. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  1007. helm/proxy/token_counters/free_token_counter.py +0 -12
  1008. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  1009. helm/proxy/token_counters/openai_token_counter.py +0 -22
  1010. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  1011. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  1012. helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
  1013. helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
  1014. helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
  1015. helm/proxy/tokenizers/ice_tokenizer.py +0 -30
  1016. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  1017. helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
  1018. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  1019. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
  1020. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  1021. /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
  1022. /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
  1023. /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
  1024. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  1025. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  1026. /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
  1027. /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
  1028. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  1029. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  1030. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  1031. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  1032. /helm/{benchmark → proxy}/static/general.js +0 -0
  1033. /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -0,0 +1,1257 @@
1
+ ---
2
+ ############################################################
3
+ metrics:
4
+ # Infrastructure metrics:
5
+ - name: num_perplexity_tokens
6
+ display_name: '# tokens'
7
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
8
+ - name: num_bytes
9
+ display_name: '# bytes'
10
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
11
+
12
+ - name: num_references
13
+ display_name: '# ref'
14
+ description: Number of references.
15
+ - name: num_train_trials
16
+ display_name: '# trials'
17
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
18
+ - name: estimated_num_tokens_cost
19
+ display_name: 'cost'
20
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
21
+ - name: num_prompt_tokens
22
+ display_name: '# prompt tokens'
23
+ description: Number of tokens in the prompt.
24
+ - name: num_prompt_characters
25
+ display_name: '# prompt chars'
26
+ description: Number of characters in the prompt.
27
+ - name: num_completion_tokens
28
+ display_name: '# completion tokens'
29
+ description: Actual number of completion tokens (over all completions).
30
+ - name: num_output_tokens
31
+ display_name: '# output tokens'
32
+ description: Actual number of output tokens.
33
+ - name: max_num_output_tokens
34
+ display_name: 'Max output tokens'
35
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
36
+ - name: num_requests
37
+ display_name: '# requests'
38
+ description: Number of distinct API requests.
39
+ - name: num_instances
40
+ display_name: '# eval'
41
+ description: Number of evaluation instances.
42
+ - name: num_train_instances
43
+ display_name: '# train'
44
+ description: Number of training instances (e.g., in-context examples).
45
+ - name: prompt_truncated
46
+ display_name: truncated
47
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
48
+ - name: finish_reason_length
49
+ display_name: finish b/c length
50
+ description: Fraction of instances where the the output was terminated because of the max tokens limit.
51
+ - name: finish_reason_stop
52
+ display_name: finish b/c stop
53
+ description: Fraction of instances where the the output was terminated because of the stop sequences.
54
+ - name: finish_reason_endoftext
55
+ display_name: finish b/c endoftext
56
+ description: Fraction of instances where the the output was terminated because the end of text token was generated.
57
+ - name: finish_reason_unknown
58
+ display_name: finish b/c unknown
59
+ description: Fraction of instances where the the output was terminated for unknown reasons.
60
+ - name: num_completions
61
+ display_name: '# completions'
62
+ description: Number of completions.
63
+ - name: predicted_index
64
+ display_name: Predicted index
65
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
66
+
67
+ # Accuracy metrics:
68
+ - name: exact_match
69
+ display_name: Exact match
70
+ short_display_name: EM
71
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
72
+ lower_is_better: false
73
+ - name: quasi_exact_match
74
+ display_name: Quasi-exact match
75
+ short_display_name: EM
76
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
77
+ lower_is_better: false
78
+ - name: prefix_exact_match
79
+ display_name: Prefix exact match
80
+ short_display_name: PEM
81
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
82
+ lower_is_better: false
83
+ - name: quasi_prefix_exact_match
84
+ # TODO: should call this prefix_quasi_exact_match
85
+ display_name: Prefix quasi-exact match
86
+ short_display_name: PEM
87
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
88
+ lower_is_better: false
89
+
90
+ - name: exact_match@5
91
+ display_name: Exact match @5
92
+ short_display_name: EM@5
93
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference exactly.
94
+ lower_is_better: false
95
+ - name: quasi_exact_match@5
96
+ display_name: Quasi-exact match @5
97
+ short_display_name: EM@5
98
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
99
+ lower_is_better: false
100
+ - name: prefix_exact_match@5
101
+ display_name: Prefix exact match @5
102
+ short_display_name: PEM@5
103
+ description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference exactly.
104
+ lower_is_better: false
105
+ - name: quasi_prefix_exact_match@5
106
+ display_name: Prefix quasi-exact match @5
107
+ short_display_name: PEM@5
108
+ description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference up to light processing.
109
+ lower_is_better: false
110
+
111
+ - name: logprob
112
+ display_name: Log probability
113
+ short_display_name: Logprob
114
+ description: Predicted output's average log probability (input's log prob for language modeling).
115
+ lower_is_better: false
116
+ - name: logprob_per_byte
117
+ display_name: Log probability / byte
118
+ short_display_name: Logprob/byte
119
+ description: Predicted output's average log probability normalized by the number of bytes.
120
+ lower_is_better: false
121
+ - name: bits_per_byte
122
+ display_name: Bits/byte
123
+ short_display_name: BPB
124
+ lower_is_better: true
125
+ description: Average number of bits per byte according to model probabilities.
126
+ - name: perplexity
127
+ display_name: Perplexity
128
+ short_display_name: PPL
129
+ lower_is_better: true
130
+ description: Perplexity of the output completion (effective branching factor per output token).
131
+ - name: rouge_1
132
+ display_name: ROUGE-1
133
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
134
+ lower_is_better: false
135
+ - name: rouge_2
136
+ display_name: ROUGE-2
137
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
138
+ lower_is_better: false
139
+ - name: rouge_l
140
+ display_name: ROUGE-L
141
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
142
+ lower_is_better: false
143
+ - name: bleu_1
144
+ display_name: BLEU-1
145
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
146
+ lower_is_better: false
147
+ - name: bleu_4
148
+ display_name: BLEU-4
149
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
150
+ lower_is_better: false
151
+ - name: f1_set_match
152
+ display_name: F1 (set match)
153
+ short_display_name: F1
154
+ description: Average F1 score in terms of set overlap between the model predicted set and correct reference set.
155
+ lower_is_better: false
156
+ - name: f1_score
157
+ display_name: F1
158
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
159
+ lower_is_better: false
160
+ - name: f1_score@5
161
+ display_name: F1@5
162
+ description: Average F1 score at top 5 in terms of word overlap between the model output and correct reference.
163
+ lower_is_better: false
164
+ - name: classification_macro_f1
165
+ display_name: Macro-F1
166
+ description: Population-level macro-averaged F1 score.
167
+ lower_is_better: false
168
+ - name: classification_micro_f1
169
+ display_name: Micro-F1
170
+ description: Population-level micro-averaged F1 score.
171
+ lower_is_better: false
172
+ - name: absolute_value_difference
173
+ display_name: Absolute difference
174
+ short_display_name: Diff.
175
+ lower_is_better: true
176
+ description: Average absolute difference between the model output (converted to a number) and the correct reference.
177
+ - name: distance
178
+ display_name: Geometric distance
179
+ short_display_name: Dist.
180
+ lower_is_better: true
181
+ description: Average gometric distance between the model output (as a point) and the correct reference (as a curve).
182
+ - name: percent_valid
183
+ display_name: Valid fraction
184
+ short_display_name: Valid
185
+ description: Fraction of valid model outputs (as a number).
186
+ lower_is_better: false
187
+ - name: RR@5
188
+ display_name: RR@5
189
+ description: Mean reciprocal rank at 5 in information retrieval.
190
+ lower_is_better: false
191
+ - name: NDCG@10
192
+ display_name: NDCG@10
193
+ description: Normalized discounted cumulative gain at 10 in information retrieval.
194
+ lower_is_better: false
195
+ - name: RR@10
196
+ display_name: RR@10
197
+ description: Mean reciprocal rank at 10 in information retrieval.
198
+ lower_is_better: false
199
+ - name: NDCG@20
200
+ display_name: NDCG@20
201
+ description: Normalized discounted cumulative gain at 20 in information retrieval.
202
+ lower_is_better: false
203
+ - name: RR@20
204
+ display_name: RR@20
205
+ description: Mean reciprocal rank at 20 in information retrieval.
206
+ lower_is_better: false
207
+ - name: Success@1
208
+ display_name: Success@1
209
+ description: Success at top 1 in information retrieval.
210
+ lower_is_better: false
211
+ - name: Success@2
212
+ display_name: Success@2
213
+ description: Success at top 2 in information retrieval.
214
+ lower_is_better: false
215
+ - name: Success@3
216
+ display_name: Success@3
217
+ description: Success at top 3 in information retrieval.
218
+ lower_is_better: false
219
+ - name: Success@5
220
+ display_name: Success@5
221
+ description: Success at top 5 in information retrieval.
222
+ lower_is_better: false
223
+ - name: Success@10
224
+ display_name: Success@10
225
+ description: Success at top 10 in information retrieval.
226
+ lower_is_better: false
227
+ - name: Success@20
228
+ display_name: Success@20
229
+ description: Success at top 20 in information retrieval.
230
+ lower_is_better: false
231
+ - name: Recall@1
232
+ display_name: Recall@1
233
+ description: Recall at top 1 in information retrieval.
234
+ lower_is_better: false
235
+ - name: Recall@2
236
+ display_name: Recall@2
237
+ description: Recall at top 2 in information retrieval.
238
+ lower_is_better: false
239
+ - name: Recall@3
240
+ display_name: Recall@3
241
+ description: Recall at top 3 in information retrieval.
242
+ lower_is_better: false
243
+ - name: Recall@5
244
+ display_name: Recall@5
245
+ description: Recall at top 5 in information retrieval.
246
+ lower_is_better: false
247
+ - name: Recall@10
248
+ display_name: Recall@10
249
+ description: Recall at top 10 in information retrieval.
250
+ lower_is_better: false
251
+ - name: Recall@20
252
+ display_name: Recall@20
253
+ description: Recall at top 20 in information retrieval.
254
+ lower_is_better: false
255
+ - name: Success@1 (topk=30)
256
+ display_name: Success@1 (topk=30)
257
+ description: Success at top 1 in information retrieval.
258
+ lower_is_better: false
259
+ - name: Success@2 (topk=30)
260
+ display_name: Success@2 (topk=30)
261
+ description: Success at top 2 in information retrieval.
262
+ lower_is_better: false
263
+ - name: Success@3 (topk=30)
264
+ display_name: Success@3 (topk=30)
265
+ description: Success at top 3 in information retrieval.
266
+ lower_is_better: false
267
+ - name: Success@5 (topk=30)
268
+ display_name: Success@5 (topk=30)
269
+ description: Success at top 5 in information retrieval.
270
+ lower_is_better: false
271
+ - name: Success@10 (topk=30)
272
+ display_name: Success@10 (topk=30)
273
+ description: Success at top 10 in information retrieval.
274
+ lower_is_better: false
275
+ - name: Success@20 (topk=30)
276
+ display_name: Success@20 (topk=30)
277
+ description: Success at top 20 in information retrieval.
278
+ lower_is_better: false
279
+ - name: Recall@1 (topk=30)
280
+ display_name: Recall@1 (topk=30)
281
+ description: Recall at top 1 in information retrieval.
282
+ lower_is_better: false
283
+ - name: Recall@2 (topk=30)
284
+ display_name: Recall@2 (topk=30)
285
+ description: Recall at top 2 in information retrieval.
286
+ lower_is_better: false
287
+ - name: Recall@3 (topk=30)
288
+ display_name: Recall@3 (topk=30)
289
+ description: Recall at top 3 in information retrieval.
290
+ lower_is_better: false
291
+ - name: Recall@5 (topk=30)
292
+ display_name: Recall@5 (topk=30)
293
+ description: Recall at top 5 in information retrieval.
294
+ lower_is_better: false
295
+ - name: Recall@10 (topk=30)
296
+ display_name: Recall@10 (topk=30)
297
+ description: Recall at top 10 in information retrieval.
298
+ lower_is_better: false
299
+ - name: Recall@20 (topk=30)
300
+ display_name: Recall@20 (topk=30)
301
+ description: Recall at top 20 in information retrieval.
302
+ lower_is_better: false
303
+ - name: RR@5 (topk=30)
304
+ display_name: RR@5 (topk=30)
305
+ description: Mean reciprocal rank at 5 in information retrieval.
306
+ lower_is_better: false
307
+ - name: RR@10 (topk=30)
308
+ display_name: RR@10 (topk=30)
309
+ description: Mean reciprocal rank at 10 in information retrieval.
310
+ lower_is_better: false
311
+ - name: RR@20 (topk=30)
312
+ display_name: RR@20 (topk=30)
313
+ description: Mean reciprocal rank at 20 in information retrieval.
314
+ lower_is_better: false
315
+ - name: math_equiv
316
+ display_name: Equivalent
317
+ description: Fraction of model outputs that are mathematically equivalent to the correct reference.
318
+ lower_is_better: false
319
+ - name: math_equiv_chain_of_thought
320
+ display_name: Equivalent (chain of thought)
321
+ description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.
322
+ lower_is_better: false
323
+ - name: exact_match_indicator
324
+ display_name: Exact match (final)
325
+ short_display_name: EM
326
+ description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator (e.g., space).
327
+ lower_is_better: false
328
+ - name: final_number_exact_match
329
+ display_name: Exact match (final number)
330
+ short_display_name: EM
331
+ description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.
332
+ lower_is_better: false
333
+ - name: exact_set_match
334
+ display_name: Exact match (at sets)
335
+ short_display_name: EM
336
+ description: Fraction of instances that the predicted output matches a correct reference exactly as sets.
337
+ lower_is_better: false
338
+ - name: iou_set_match
339
+ display_name: Intersection over union (as sets)
340
+ short_display_name: IoU
341
+ description: Intersection over union in terms of set overlap between the model predicted set and correct reference set.
342
+ lower_is_better: false
343
+
344
+ # Summariazation metrics
345
+ - name: summac
346
+ display_name: SummaC
347
+ description: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
348
+ lower_is_better: false
349
+ - name: QAFactEval
350
+ display_name: QAFactEval
351
+ description: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
352
+ lower_is_better: false
353
+ - name: summarization_coverage
354
+ display_name: Coverage
355
+ description: Extent to which the model-generated summaries are extractive fragments from the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
356
+ - name: summarization_density
357
+ display_name: Density
358
+ description: Extent to which the model-generated summaries are extractive summaries based on the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
359
+ - name: summarization_compression
360
+ display_name: Compression
361
+ description: Extent to which the model-generated summaries are compressed relative to the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
362
+ - name: BERTScore-P
363
+ display_name: BERTScore (P)
364
+ description: Average BERTScore precision [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
365
+ lower_is_better: false
366
+ - name: BERTScore-R
367
+ display_name: BERTScore (R)
368
+ description: Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
369
+ lower_is_better: false
370
+ - name: BERTScore-F
371
+ display_name: BERTScore (F1)
372
+ description: Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
373
+ lower_is_better: false
374
+ - name: HumanEval-faithfulness
375
+ display_name: HumanEval-faithfulness
376
+ description: Human evaluation score for faithfulness.
377
+ lower_is_better: false
378
+ - name: HumanEval-relevance
379
+ display_name: HumanEval-relevance
380
+ description: Human evaluation score for relevance.
381
+ lower_is_better: false
382
+ - name: HumanEval-coherence
383
+ display_name: HumanEval-coherence
384
+ description: Human evaluation score for coherence.
385
+ lower_is_better: false
386
+
387
+ # Bias metrics:
388
+ - name: bias_metric:mode=associations,demographic_category=race,target_category=profession
389
+ display_name: Stereotypical associations (race, profession)
390
+ short_display_name: Stereotypes (race)
391
+ lower_is_better: true
392
+ description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on cooccurence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
393
+ - name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
394
+ display_name: Stereotypical associations (race, adjectives)
395
+ short_display_name: Stereotypes (race)
396
+ lower_is_better: true
397
+ description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on cooccurence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
398
+
399
+ - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
400
+ display_name: Stereotypical associations (gender, profession)
401
+ short_display_name: Stereotypes (gender)
402
+ lower_is_better: true
403
+ description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on cooccurence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
404
+ - name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
405
+ display_name: Stereotypical associations (gender, adjectives)
406
+ short_display_name: Stereotypes (gender)
407
+ lower_is_better: true
408
+ description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on cooccurence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
409
+
410
+ - name: bias_metric:mode=representation,demographic_category=race
411
+ display_name: Demographic representation (race)
412
+ short_display_name: Representation (race)
413
+ lower_is_better: true
414
+ description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
415
+ - name: bias_metric:mode=representation,demographic_category=gender
416
+ display_name: Demographic representation (gender)
417
+ short_display_name: Representation (gender)
418
+ lower_is_better: true
419
+ description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
420
+
421
+ - name: bbq_metric_ambiguous_bias
422
+ display_name: BBQ (ambiguous)
423
+ lower_is_better: true
424
+ description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
425
+ - name: bbq_metric_unambiguous_bias
426
+ display_name: BBQ (unambiguous)
427
+ lower_is_better: true
428
+ description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
429
+
430
+ # Toxicity metrics
431
+ - name: expected_max_toxicity
432
+ display_name: Average max toxicity
433
+ short_display_name: Avg. max toxicity
434
+ lower_is_better: true
435
+ description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
436
+ - name: max_toxicity_probability
437
+ display_name: max toxicity
438
+ lower_is_better: true
439
+ description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
440
+ - name: toxic_frac
441
+ display_name: Toxic fraction
442
+ lower_is_better: true
443
+ description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
444
+
445
+ # Efficiency metrics
446
+ - name: training_co2_cost
447
+ display_name: Estimated training emissions (kg CO2)
448
+ short_display_name: Training emissions (kg CO2)
449
+ lower_is_better: true
450
+ description: Estimate of the CO2 emissions from training the model.
451
+ - name: training_energy_cost
452
+ display_name: Estimated training energy cost (MWh)
453
+ short_display_name: Training energy (MWh)
454
+ lower_is_better: true
455
+ description: Estimate of the amount of energy used to train the model.
456
+ - name: inference_runtime
457
+ display_name: Observed inference runtime (s)
458
+ short_display_name: Observed inference time (s)
459
+ lower_is_better: true
460
+ description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
461
+ - name: inference_idealized_runtime
462
+ display_name: Idealized inference runtime (s)
463
+ short_display_name: Idealized inference time (s)
464
+ lower_is_better: true
465
+ description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
466
+ - name: inference_denoised_runtime
467
+ display_name: Denoised inference runtime (s)
468
+ short_display_name: Denoised inference time (s)
469
+ lower_is_better: true
470
+ description: Average time to process a request to the model minus performance contention by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
471
+ - name: batch_size
472
+ display_name: Batch size
473
+ description: For batch jobs, how many requests are in a batch.
474
+
475
+ # Calibration metrics:
476
+ - name: max_prob
477
+ display_name: Max prob
478
+ description: Model's average confidence in its prediction (only computed for classification tasks)
479
+ lower_is_better: false
480
+ - name: ece_10_bin
481
+ display_name: 10-bin expected calibration error
482
+ short_display_name: ECE (10-bin)
483
+ lower_is_better: true
484
+ description: The average difference between the model's confidence and accuracy, averaged across 10 bins where each bin contains an equal number of points (only computed for classification tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because each bin will have very few examples.
485
+ - name: ece_1_bin
486
+ display_name: 1-bin expected calibration error
487
+ short_display_name: ECE (1-bin)
488
+ lower_is_better: true
489
+ description: The (absolute value) difference between the model's average confidence and accuracy (only computed for classification tasks).
490
+ - name: selective_cov_acc_area
491
+ display_name: Selective coverage-accuracy area
492
+ short_display_name: Selective Acc
493
+ description: The area under the coverage-accuracy curve, a standard selective classification metric (only computed for classification tasks).
494
+ lower_is_better: false
495
+ - name: selective_acc@10
496
+ display_name: Accuracy at 10% coverage
497
+ short_display_name: Acc@10%
498
+ description: The accuracy for the 10% of predictions that the model is most confident on (only computed for classification tasks).
499
+ lower_is_better: false
500
+ - name: platt_ece_10_bin
501
+ display_name: 10-bin Expected Calibration Error (after Platt scaling)
502
+ short_display_name: Platt-scaled ECE (10-bin)
503
+ lower_is_better: true
504
+ description: 10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
505
+ - name: platt_ece_1_bin
506
+ display_name: 1-bin expected calibration error (after Platt scaling)
507
+ short_display_name: Platt-scaled ECE (1-bin)
508
+ lower_is_better: true
509
+ description: 1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
510
+ - name: platt_coef
511
+ display_name: Platt Scaling Coefficient
512
+ short_display_name: Platt Coef
513
+ description: Coefficient of the Platt scaling classifier (can compare this across tasks).
514
+ lower_is_better: false
515
+ - name: platt_intercept
516
+ display_name: Platt Scaling Intercept
517
+ short_display_name: Platt Intercept
518
+ description: Intercept of the Platt scaling classifier (can compare this across tasks).
519
+ lower_is_better: false
520
+
521
+ ############################################################
522
+ perturbations:
523
+ - name: robustness
524
+ display_name: Robustness
525
+ description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
526
+ - name: fairness
527
+ display_name: Fairness
528
+ description: Computes worst case over different fairness perturbations (changing dialect, race of names, gender).
529
+ - name: typos
530
+ display_name: Typos
531
+ description: >
532
+ Randomly adds typos to each token in the input with probability 0.05 and computes the per-instance worst-case
533
+ performance between perturbed and unperturbed versions.
534
+ - name: synonym
535
+ display_name: Synonyms
536
+ description: >
537
+ Randomly substitutes words in the input with WordNet synonyms with probability 0.5 and computes the per-instance
538
+ worst-case performance between perturbed and unperturbed versions.
539
+ - name: dialect
540
+ display_name: SAE -> AAE
541
+ short_display_name: Dialect
542
+ description: >
543
+ Deterministically substitutes SAE words in input with AAE counterparts using validated dictionary of [Ziems et al. (2022)](https://aclanthology.org/2022.acl-long.258/) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
544
+ - name: race
545
+ display_name: First names by race (White -> Black)
546
+ short_display_name: Race
547
+ description: >
548
+ Deterministically substitutes White first names with Black first names sampled from the lists of [Caliskan et al. (2017)](https://www.science.org/doi/10.1126/science.aal4230) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
549
+ - name: gender
550
+ display_name: Pronouns by gender (Male -> Female)
551
+ short_display_name: Gender
552
+ description: >
553
+ Deterministically substitutes male pronouns with female pronouns and computes the per-instance worst-case
554
+ performance between perturbed and unperturbed versions.
555
+
556
+ ############################################################
557
+ metric_groups:
558
+ - name: accuracy
559
+ display_name: Accuracy
560
+ metrics:
561
+ - name: ${main_name}
562
+ split: ${main_split}
563
+
564
+ - name: calibration
565
+ display_name: Calibration
566
+ metrics:
567
+ - name: ece_10_bin
568
+ split: ${main_split}
569
+
570
+ - name: calibration_detailed
571
+ display_name: Calibration (Detailed)
572
+ description: Measures how calibrated the model is (how meaningful its uncertainty estimates are).
573
+ metrics:
574
+ - name: max_prob
575
+ split: ${main_split}
576
+ - name: ece_1_bin
577
+ split: ${main_split}
578
+ - name: ece_10_bin
579
+ split: ${main_split}
580
+ - name: selective_cov_acc_area
581
+ split: ${main_split}
582
+ - name: selective_acc@10
583
+ split: ${main_split}
584
+ - name: platt_ece_1_bin
585
+ split: ${main_split}
586
+ - name: platt_ece_10_bin
587
+ split: ${main_split}
588
+ - name: platt_coef
589
+ split: ${main_split}
590
+ - name: platt_intercept
591
+ split: ${main_split}
592
+
593
+ - name: robustness
594
+ display_name: Robustness
595
+ metrics:
596
+ - name: ${main_name}
597
+ split: ${main_split}
598
+ perturbation_name: robustness
599
+
600
+ # TODO: Add other robustness perturbations
601
+ - name: robustness_detailed
602
+ display_name: Robustness (Detailed)
603
+ description: Measures how robust the model is to invariances.
604
+ metrics:
605
+ - name: ${main_name}
606
+ split: ${main_split}
607
+ perturbation_name: typos
608
+ - name: ${main_name}
609
+ split: ${main_split}
610
+ perturbation_name: synonyms
611
+
612
+ - name: fairness
613
+ display_name: Fairness
614
+ metrics:
615
+ - name: ${main_name}
616
+ split: ${main_split}
617
+ perturbation_name: fairness
618
+
619
+ # TODO: Add other fairness perturbations
620
+ - name: fairness_detailed
621
+ display_name: Fairness (Detailed)
622
+ description: Measures how fair the model is.
623
+ metrics:
624
+ - name: ${main_name}
625
+ split: ${main_split}
626
+ perturbation_name: dialect
627
+ - name: ${main_name}
628
+ split: ${main_split}
629
+ perturbation_name: race
630
+ - name: ${main_name}
631
+ split: ${main_split}
632
+ perturbation_name: gender
633
+
634
+ - name: bias
635
+ display_name: Bias
636
+ metrics:
637
+ - name: bias_metric:mode=associations,demographic_category=race,target_category=profession
638
+ split: ${main_split}
639
+ - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
640
+ split: ${main_split}
641
+ - name: bias_metric:mode=representation,demographic_category=race
642
+ split: ${main_split}
643
+ - name: bias_metric:mode=representation,demographic_category=gender
644
+ split: ${main_split}
645
+
646
+ - name: toxicity
647
+ display_name: Toxicity
648
+ metrics:
649
+ - name: toxic_frac
650
+ split: ${main_split}
651
+
652
+ - name: efficiency
653
+ display_name: Efficiency
654
+ metrics:
655
+ - name: inference_denoised_runtime
656
+ split: ${main_split}
657
+
658
+ - name: efficiency_detailed
659
+ display_name: Efficiency (Detailed)
660
+ description: The efficiency of the model across both training and inference.
661
+ metrics:
662
+ - name: inference_runtime
663
+ split: ${main_split}
664
+ - name: inference_idealized_runtime
665
+ split: ${main_split}
666
+ - name: inference_denoised_runtime
667
+ split: ${main_split}
668
+ - name: training_co2_cost
669
+ split: ${main_split}
670
+ - name: training_energy_cost
671
+ split: ${main_split}
672
+
673
+ - name: general_information
674
+ display_name: General information
675
+ metrics:
676
+ - name: num_instances
677
+ split: ${main_split}
678
+ - name: num_train_instances
679
+ split: ${main_split}
680
+ - name: prompt_truncated
681
+ split: ${main_split}
682
+ - name: num_prompt_tokens
683
+ split: ${main_split}
684
+ - name: num_output_tokens
685
+ split: ${main_split}
686
+ - name: num_train_trials
687
+ split: ${main_split}
688
+
689
+ # Special metrics for scenarios with more than 1 main metric
690
+ - name: summarization_metrics
691
+ display_name: Summarization metrics
692
+ metrics:
693
+ - name: summac
694
+ split: ${main_split}
695
+ - name: QAFactEval
696
+ split: ${main_split}
697
+ - name: BERTScore-F
698
+ split: ${main_split}
699
+ - name: summarization_coverage
700
+ split: ${main_split}
701
+ - name: summarization_density
702
+ split: ${main_split}
703
+ - name: summarization_compression
704
+ split: ${main_split}
705
+ - name: HumanEval-faithfulness
706
+ split: ${main_split}
707
+ - name: HumanEval-relevance
708
+ split: ${main_split}
709
+ - name: HumanEval-coherence
710
+ split: ${main_split}
711
+
712
+ - name: classification_metrics
713
+ display_name: Classification metrics
714
+ metrics:
715
+ - name: classification_macro_f1
716
+ split: ${main_split}
717
+ - name: classification_micro_f1
718
+ split: ${main_split}
719
+
720
+ #######################################################
721
+ run_groups:
722
+ - name: melt
723
+ display_name: MELT Scenarios
724
+ description: Scenarios for the medical domain
725
+ category: All scenarios
726
+ subgroups:
727
+ - melt_question_answering_mlqa
728
+ - melt_question_answering_xquad
729
+ - melt_summarization_vietnews
730
+ - melt_summarization_wikilingua
731
+ - melt_synthetic_reasoning
732
+ - melt_math
733
+ - melt_text_classification_vsmec
734
+ - melt_text_classification_phoatis
735
+ - melt_sentiment_analysis_vlsp
736
+ - melt_sentiment_analysis_vsfc
737
+ - melt_translation_opus100
738
+ - melt_translation_phomt
739
+ - melt_lm_mask_filling_mlqa
740
+ - melt_lm_spelling_correction_vsec
741
+ - melt_knowledge_zalo
742
+ - melt_knowledge_vimmrc
743
+ - melt_toxicity_detection_vihsd
744
+ - melt_toxicity_detection_victsd
745
+ - melt_information_retrieval_mmarco
746
+ - melt_information_retrieval_mrobust
747
+
748
+ - name: melt_question_answering_mlqa
749
+ display_name: MLQA
750
+ description: Scenarios for question answering with the MLQA dataset.
751
+ category: Question Answering
752
+ metric_groups:
753
+ - accuracy
754
+ - efficiency
755
+ - general_information
756
+ environment:
757
+ main_name: quasi_exact_match
758
+ main_split: test
759
+ taxonomy:
760
+ task: question answering
761
+ what: "?"
762
+ who: "?"
763
+ when: "?"
764
+ language: Vietnamese
765
+
766
+ - name: melt_question_answering_xquad
767
+ display_name: XQuAD
768
+ description: Scenarios for question answering with the XQuAD dataset.
769
+ category: Question Answering
770
+ metric_groups:
771
+ - accuracy
772
+ - efficiency
773
+ - general_information
774
+ environment:
775
+ main_name: quasi_exact_match
776
+ main_split: test
777
+ taxonomy:
778
+ task: question answering
779
+ what: "?"
780
+ who: "?"
781
+ when: "?"
782
+ language: Vietnamese
783
+
784
+ - name: melt_summarization_vietnews
785
+ display_name: VietNews
786
+ description: Scenarios for summarization with the VietNews dataset.
787
+ category: Summarization
788
+ metric_groups:
789
+ - accuracy
790
+ - summarization_metrics
791
+ - bias
792
+ - toxicity
793
+ - efficiency
794
+ - general_information
795
+ environment:
796
+ main_name: rouge_2
797
+ main_split: test
798
+ taxonomy:
799
+ task: summarization
800
+ what: "Vietnamese online newspapers."
801
+ who: "?"
802
+ when: "?"
803
+ language: Vietnamese
804
+
805
+ - name: melt_summarization_wikilingua
806
+ display_name: WikiLingua
807
+ description: Scenarios for summarization with the WikiLingua dataset.
808
+ category: Summarization
809
+ metric_groups:
810
+ - accuracy
811
+ - summarization_metrics
812
+ - bias
813
+ - toxicity
814
+ - efficiency
815
+ - general_information
816
+ environment:
817
+ main_name: rouge_2
818
+ main_split: test
819
+ taxonomy:
820
+ task: summarization
821
+ what: "?"
822
+ who: "?"
823
+ when: "?"
824
+ language: Vietnamese
825
+
826
+ - name: melt_synthetic_reasoning
827
+ display_name: Synthetic reasoning (abstract symbols)
828
+ description: Synthetic reasoning tasks defined using abstract symbols based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
829
+ metric_groups:
830
+ - accuracy
831
+ - efficiency
832
+ - general_information
833
+ environment:
834
+ main_name: quasi_exact_match
835
+ main_split: test
836
+ taxonomy:
837
+ task: "reasoning"
838
+ what: n/a
839
+ who: n/a
840
+ when: n/a
841
+ language: synthetic
842
+ subgroups:
843
+ - melt_synthetic_reasoning_pattern_match
844
+ - melt_synthetic_reasoning_variable_substitution
845
+ - melt_synthetic_reasoning_induction
846
+
847
+ - name: melt_synthetic_reasoning_pattern_match
848
+ display_name: Synthetic reasoning (pattern match)
849
+ description: Synthetic reasoning tasks defined using pattern matching based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
850
+ metric_groups:
851
+ - accuracy
852
+ - efficiency
853
+ - general_information
854
+ environment:
855
+ main_name: quasi_exact_match
856
+ main_split: test
857
+ taxonomy:
858
+ task: "reasoning"
859
+ what: n/a
860
+ who: n/a
861
+ when: n/a
862
+ language: synthetic
863
+
864
+ - name: melt_synthetic_reasoning_variable_substitution
865
+ display_name: Synthetic reasoning (variable substitution)
866
+ description: Synthetic reasoning tasks defined using variable substitution based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
867
+ metric_groups:
868
+ - accuracy
869
+ - efficiency
870
+ - general_information
871
+ environment:
872
+ main_name: quasi_exact_match
873
+ main_split: test
874
+ taxonomy:
875
+ task: "reasoning"
876
+ what: n/a
877
+ who: n/a
878
+ when: n/a
879
+ language: synthetic
880
+
881
+ - name: melt_synthetic_reasoning_induction
882
+ display_name: Synthetic reasoning (induction)
883
+ description: Synthetic reasoning tasks defined using induction based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
884
+ metric_groups:
885
+ - accuracy
886
+ - efficiency
887
+ - general_information
888
+ environment:
889
+ main_name: quasi_exact_match
890
+ main_split: test
891
+ taxonomy:
892
+ task: "reasoning"
893
+ what: n/a
894
+ who: n/a
895
+ when: n/a
896
+ language: synthetic
897
+
898
+ - name: melt_synthetic_reasoning_natural
899
+ display_name: Synthetic reasoning (natural language)
900
+ description: Synthetic reasoning tasks defined using simple natural language based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
901
+ metric_groups:
902
+ - accuracy
903
+ - efficiency
904
+ - general_information
905
+ environment:
906
+ main_name: f1_set_match
907
+ main_split: test
908
+ taxonomy:
909
+ task: "reasoning"
910
+ what: n/a
911
+ who: n/a
912
+ when: n/a
913
+ language: synthetic
914
+
915
+ - name: melt_math
916
+ display_name: MATH
917
+ description: The MATH benchmark for measuring mathematical problem solving on competition math problems [(Hendrycks et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).
918
+ metric_groups:
919
+ - accuracy
920
+ - efficiency
921
+ - general_information
922
+ subgroups:
923
+ - melt_math_regular
924
+ - melt_math_chain_of_thought
925
+
926
+ - name: melt_math_regular
927
+ display_name: MATH
928
+ description: The MATH benchmark for measuring mathematical problem solving on competition math problems [(Hendrycks et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).
929
+ metric_groups:
930
+ - accuracy
931
+ - efficiency
932
+ - general_information
933
+ environment:
934
+ main_name: math_equiv
935
+ main_split: test
936
+ taxonomy:
937
+ task: "reasoning"
938
+ what: n/a
939
+ who: n/a
940
+ when: n/a
941
+ language: synthetic
942
+
943
+ - name: melt_math_chain_of_thought
944
+ display_name: MATH (chain-of-thought)
945
+ description: The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).
946
+ metric_groups:
947
+ - accuracy
948
+ - efficiency
949
+ - general_information
950
+ environment:
951
+ main_name: math_equiv_chain_of_thought
952
+ main_split: test
953
+ taxonomy:
954
+ task: "reasoning"
955
+ what: n/a
956
+ who: n/a
957
+ when: n/a
958
+ language: synthetic
959
+
960
+ - name: melt_text_classification_phoatis
961
+ display_name: PhoATIS
962
+ short_display_name: PhoATIS
963
+ description: The PhoATIS benchmark for measuring text classification on Vietnamese ATIS.
964
+ metric_groups:
965
+ - accuracy
966
+ - calibration
967
+ - robustness
968
+ - fairness
969
+ - bias
970
+ - toxicity
971
+ - efficiency
972
+ - general_information
973
+ environment:
974
+ main_name: quasi_exact_match
975
+ main_split: test
976
+ taxonomy:
977
+ task: text classification
978
+ what: "Flight information."
979
+ who: "?"
980
+ when: "?"
981
+ language: Vietnamese
982
+
983
+ - name: melt_text_classification_vsmec
984
+ display_name: VSMEC
985
+ short_display_name: VSMEC
986
+ description: The VSMEC benchmark for measuring text classification on Vietnamese MSEC.
987
+ metric_groups:
988
+ - accuracy
989
+ - calibration
990
+ - robustness
991
+ - fairness
992
+ - bias
993
+ - toxicity
994
+ - efficiency
995
+ - general_information
996
+ environment:
997
+ main_name: quasi_exact_match
998
+ main_split: test
999
+ taxonomy:
1000
+ task: text classification
1001
+ what: "?"
1002
+ who: "?"
1003
+ when: "?"
1004
+ language: Vietnamese
1005
+
1006
+ - name: melt_sentiment_analysis_vlsp
1007
+ display_name: VLSP
1008
+ short_display_name: VLSP
1009
+ description: The VLSP benchmark for measuring sentiment analysis on Vietnamese VLSP.
1010
+ metric_groups:
1011
+ - accuracy
1012
+ - calibration
1013
+ - robustness
1014
+ - fairness
1015
+ - bias
1016
+ - toxicity
1017
+ - efficiency
1018
+ - general_information
1019
+ environment:
1020
+ main_name: quasi_exact_match
1021
+ main_split: test
1022
+ taxonomy:
1023
+ task: sentiment analysis
1024
+ what: "Online comments"
1025
+ who: "?"
1026
+ when: "?"
1027
+ language: Vietnamese
1028
+
1029
+ - name: melt_sentiment_analysis_vsfc
1030
+ display_name: VSFC
1031
+ short_display_name: VSFC
1032
+ description: The VSFC benchmark for measuring sentiment analysis on Vietnamese VSFC.
1033
+ metric_groups:
1034
+ - accuracy
1035
+ - calibration
1036
+ - robustness
1037
+ - fairness
1038
+ - bias
1039
+ - toxicity
1040
+ - efficiency
1041
+ - general_information
1042
+ environment:
1043
+ main_name: quasi_exact_match
1044
+ main_split: test
1045
+ taxonomy:
1046
+ task: sentiment analysis
1047
+ what: "?"
1048
+ who: "?"
1049
+ when: "?"
1050
+ language: Vietnamese
1051
+
1052
+ - name: melt_translation_opus100
1053
+ display_name: OPUS100
1054
+ short_display_name: OPUS100
1055
+ description: The OPUS100 benchmark for measuring translation on Vietnamese OPUS100.
1056
+ metric_groups:
1057
+ - accuracy
1058
+ - efficiency
1059
+ - general_information
1060
+ environment:
1061
+ main_name: quasi_exact_match
1062
+ main_split: test
1063
+ taxonomy:
1064
+ task: translation
1065
+ what: "?"
1066
+ who: "?"
1067
+ when: "?"
1068
+ language: Vietnamese
1069
+
1070
+ - name: melt_translation_phomt
+ display_name: PhoMT
+ short_display_name: PhoMT
+ description: The PhoMT benchmark for measuring English-Vietnamese translation on the PhoMT parallel corpus.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: test
+ taxonomy:
+ task: translation
+ what: "?"
+ who: "?"
+ when: "?"
+ language: Vietnamese
+
+ - name: melt_lm_mask_filling_mlqa
+ display_name: MLQA
+ description: The MLQA benchmark for measuring language model mask filling on the Vietnamese portion of MLQA.
+ metric_groups:
+ - accuracy
+ - calibration
+ - robustness
+ - fairness
+ - bias
+ - toxicity
+ - efficiency
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: test
+ taxonomy:
+ task: language model mask filling
+ what: "?"
+ who: "?"
+ when: "?"
+ language: Vietnamese
+
+ - name: melt_lm_spelling_correction_vsec
+ display_name: VSEC
+ short_display_name: VSEC
+ description: The VSEC benchmark for measuring language model spelling correction on Vietnamese text.
+ metric_groups:
+ - accuracy
+ - calibration
+ - robustness
+ - fairness
+ - bias
+ - toxicity
+ - efficiency
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: test
+ taxonomy:
+ task: language model spelling correction
+ what: "?"
+ who: "?"
+ when: "?"
+ language: Vietnamese
+
+ - name: melt_knowledge_zalo
+ display_name: ZaloE2E
+ short_display_name: ZaloE2E
+ description: The ZaloE2E benchmark for measuring intrinsic knowledge through open-domain question answering in Vietnamese.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: test
+ taxonomy:
+ task: intrinsic knowledge
+ what: "?"
+ who: "?"
+ when: "?"
+ language: Vietnamese
+
+ - name: melt_knowledge_vimmrc
+ display_name: ViMMRC
+ short_display_name: ViMMRC
+ description: The ViMMRC benchmark for measuring intrinsic knowledge through multiple-choice reading comprehension in Vietnamese.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: test
+ taxonomy:
+ task: intrinsic knowledge
+ what: "?"
+ who: "?"
+ when: "?"
+ language: Vietnamese
+
+ - name: melt_toxicity_detection_vihsd
+ display_name: ViHSD
+ short_display_name: ViHSD
+ description: The ViHSD benchmark for measuring toxicity detection (hate speech detection) on Vietnamese social media comments.
+ metric_groups:
+ - accuracy
+ - calibration
+ - robustness
+ - fairness
+ - bias
+ - toxicity
+ - efficiency
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: test
+ taxonomy:
+ task: toxicity classification
+ what: "?"
+ who: "?"
+ when: "?"
+ language: Vietnamese
+
+ - name: melt_toxicity_detection_victsd
+ display_name: ViCTSD
+ short_display_name: ViCTSD
+ description: The ViCTSD benchmark for measuring toxicity detection (constructive and toxic speech detection) on Vietnamese comments.
+ metric_groups:
+ - accuracy
+ - calibration
+ - robustness
+ - fairness
+ - bias
+ - toxicity
+ - efficiency
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: test
+ taxonomy:
+ task: toxicity classification
+ what: "?"
+ who: "?"
+ when: "?"
+ language: Vietnamese
+
+ - name: melt_information_retrieval_mmarco
+ display_name: MARCO
+ short_display_name: MARCO
+ description: The MARCO (mMARCO) benchmark for measuring information retrieval on the Vietnamese translation of MS MARCO.
+ metric_groups:
+ - accuracy
+ - robustness
+ - fairness
+ - bias
+ - toxicity
+ - efficiency
+ - general_information
+ environment:
+ main_name: RR@10
+ main_split: valid
+ taxonomy:
+ task: information retrieval
+ what: "?"
+ who: "?"
+ when: "?"
+ language: Vietnamese
+
+ - name: melt_information_retrieval_mrobust
+ display_name: MRobust
+ short_display_name: MRobust
+ description: The MRobust (mRobust04) benchmark for measuring information retrieval on the Vietnamese translation of Robust04.
+ metric_groups:
+ - accuracy
+ - robustness
+ - fairness
+ - bias
+ - toxicity
+ - efficiency
+ - general_information
+ environment:
+ main_name: NDCG@10
+ main_split: valid
+ taxonomy:
+ task: information retrieval
+ what: "?"
+ who: "?"
+ when: "?"
+ language: Vietnamese
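Unlike the other groups, the two information-retrieval entries report ranking metrics on the validation split: RR@10 for mMARCO and NDCG@10 for mRobust. A compact sketch of both, assuming binary relevance for RR@10 and graded relevance judgments for NDCG@10 (the document IDs and data below are illustrative):

```python
# Minimal sketch of RR@k and NDCG@k over a ranked list of document ids;
# the relevance data and examples are illustrative.
import math
from typing import Dict, List, Set


def rr_at_k(ranking: List[str], relevant: Set[str], k: int = 10) -> float:
    """Reciprocal rank of the first relevant document within the top k."""
    for rank, doc_id in enumerate(ranking[:k], start=1):
        if doc_id in relevant:
            return 1.0 / rank
    return 0.0


def ndcg_at_k(ranking: List[str], gains: Dict[str, float], k: int = 10) -> float:
    """Normalized discounted cumulative gain over graded relevance `gains`."""
    def dcg(rels: List[float]) -> float:
        return sum(rel / math.log2(i + 1) for i, rel in enumerate(rels, start=1))

    dcg_k = dcg([gains.get(doc_id, 0.0) for doc_id in ranking[:k]])
    ideal = dcg(sorted(gains.values(), reverse=True)[:k])
    return dcg_k / ideal if ideal > 0 else 0.0


ranking = ["d3", "d1", "d7", "d2"]
print(rr_at_k(ranking, relevant={"d1"}))                 # 0.5
print(ndcg_at_k(ranking, gains={"d1": 3.0, "d2": 1.0}))  # ~0.64
```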