crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (1033) hide show
  1. crfm_helm-0.5.10.dist-info/METADATA +369 -0
  2. crfm_helm-0.5.10.dist-info/RECORD +1008 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +80 -29
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  8. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  9. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  10. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
  11. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
  12. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
  13. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  14. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
  15. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  16. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
  17. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
  18. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
  19. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  20. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
  21. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  22. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  23. helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
  24. helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
  25. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
  26. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
  27. helm/benchmark/adaptation/common_adapter_specs.py +443 -0
  28. helm/benchmark/adaptation/prompt.py +1 -1
  29. helm/benchmark/adaptation/request_state.py +6 -1
  30. helm/benchmark/adaptation/scenario_state.py +6 -2
  31. helm/benchmark/annotation/aci_bench_annotator.py +84 -0
  32. helm/benchmark/annotation/air_bench_annotator.py +79 -0
  33. helm/benchmark/annotation/alrage_annotator.py +90 -0
  34. helm/benchmark/annotation/annotator.py +48 -0
  35. helm/benchmark/annotation/annotator_factory.py +50 -0
  36. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  37. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  38. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  39. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  40. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  41. helm/benchmark/annotation/call_center_annotator.py +258 -0
  42. helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
  43. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  44. helm/benchmark/annotation/dischargeme_annotator.py +96 -0
  45. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  46. helm/benchmark/annotation/financebench_annotator.py +79 -0
  47. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  48. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  49. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
  50. helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
  51. helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
  52. helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
  53. helm/benchmark/annotation/live_qa_annotator.py +76 -0
  54. helm/benchmark/annotation/med_dialog_annotator.py +88 -0
  55. helm/benchmark/annotation/medalign_annotator.py +89 -0
  56. helm/benchmark/annotation/medi_qa_annotator.py +87 -0
  57. helm/benchmark/annotation/medication_qa_annotator.py +86 -0
  58. helm/benchmark/annotation/mental_health_annotator.py +87 -0
  59. helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
  60. helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
  61. helm/benchmark/annotation/model_as_judge.py +309 -0
  62. helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
  63. helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
  64. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  65. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  66. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  67. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  68. helm/benchmark/annotation/spider_annotator.py +18 -0
  69. helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
  70. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  71. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  72. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  73. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  74. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  75. helm/benchmark/annotation/xstest_annotator.py +100 -0
  76. helm/benchmark/annotation_executor.py +144 -0
  77. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  78. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  79. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  80. helm/benchmark/augmentations/data_augmenter.py +0 -2
  81. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  82. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  83. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  84. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  85. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  86. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  87. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  88. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  89. helm/benchmark/augmentations/perturbation.py +26 -4
  90. helm/benchmark/augmentations/perturbation_description.py +1 -1
  91. helm/benchmark/augmentations/space_perturbation.py +2 -2
  92. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  93. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  94. helm/benchmark/augmentations/test_perturbation.py +56 -19
  95. helm/benchmark/augmentations/translate_perturbation.py +31 -0
  96. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  97. helm/benchmark/config_registry.py +7 -1
  98. helm/benchmark/data_preprocessor.py +2 -2
  99. helm/benchmark/executor.py +54 -25
  100. helm/benchmark/huggingface_registration.py +28 -10
  101. helm/benchmark/metrics/air_bench_metrics.py +3212 -0
  102. helm/benchmark/metrics/alrage_metric.py +35 -0
  103. helm/benchmark/metrics/annotation_metrics.py +108 -0
  104. helm/benchmark/metrics/basic_metrics.py +437 -667
  105. helm/benchmark/metrics/bbq_metrics.py +17 -6
  106. helm/benchmark/metrics/bias_metrics.py +18 -9
  107. helm/benchmark/metrics/bias_word_lists.py +1 -1
  108. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  109. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  110. helm/benchmark/metrics/classification_metrics.py +107 -22
  111. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  112. helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
  113. helm/benchmark/metrics/code_metrics.py +5 -5
  114. helm/benchmark/metrics/code_metrics_helper.py +11 -3
  115. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  116. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  117. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  118. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  119. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  120. helm/benchmark/metrics/comet_metric.py +125 -0
  121. helm/benchmark/metrics/common_metric_specs.py +174 -0
  122. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
  123. helm/benchmark/metrics/copyright_metrics.py +5 -5
  124. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  125. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  126. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  127. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  128. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  129. helm/benchmark/metrics/disinformation_metrics.py +8 -114
  130. helm/benchmark/metrics/dry_run_metrics.py +35 -6
  131. helm/benchmark/metrics/efficiency_metrics.py +287 -0
  132. helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
  133. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  134. helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
  135. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  136. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  137. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
  138. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  139. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  140. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  141. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
  142. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  143. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  144. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  145. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  146. helm/benchmark/metrics/ifeval_metrics.py +67 -0
  147. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  148. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  149. helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
  150. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  151. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  152. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  153. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  154. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  155. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  156. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  157. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  158. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  159. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  160. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  161. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  162. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  163. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  164. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  165. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  166. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  167. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  168. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  169. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  170. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  171. helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
  172. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  173. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  174. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  175. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  176. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  177. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  178. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  179. helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
  180. helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
  181. helm/benchmark/metrics/language_modeling_metrics.py +111 -0
  182. helm/benchmark/metrics/live_qa_metrics.py +35 -0
  183. helm/benchmark/metrics/llm_jury_metrics.py +58 -0
  184. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  185. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  186. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  187. helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
  188. helm/benchmark/metrics/medec_metrics.py +124 -0
  189. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  190. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  191. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  192. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  193. helm/benchmark/metrics/metric.py +121 -175
  194. helm/benchmark/metrics/metric_name.py +0 -1
  195. helm/benchmark/metrics/metric_service.py +23 -7
  196. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
  197. helm/benchmark/metrics/nltk_helper.py +32 -0
  198. helm/benchmark/metrics/omni_math_metrics.py +44 -0
  199. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  200. helm/benchmark/metrics/output_processing_metric.py +60 -0
  201. helm/benchmark/metrics/output_processors.py +15 -0
  202. helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
  203. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  204. helm/benchmark/metrics/ranking_metrics.py +5 -5
  205. helm/benchmark/metrics/reference_metric.py +148 -0
  206. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  207. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  208. helm/benchmark/metrics/safety_metrics.py +91 -0
  209. helm/benchmark/metrics/seahelm_metrics.py +201 -0
  210. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  211. helm/benchmark/metrics/spider_metrics.py +7 -0
  212. helm/benchmark/metrics/statistic.py +1 -1
  213. helm/benchmark/metrics/summac/model_summac.py +8 -11
  214. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  215. helm/benchmark/metrics/summarization_metrics.py +150 -11
  216. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  217. helm/benchmark/metrics/test_classification_metrics.py +145 -70
  218. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  219. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
  220. helm/benchmark/metrics/test_metric.py +3 -3
  221. helm/benchmark/metrics/test_statistic.py +2 -2
  222. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  223. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  224. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  225. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  226. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
  227. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  228. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
  229. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
  230. helm/benchmark/metrics/toxicity_metrics.py +37 -7
  231. helm/benchmark/metrics/toxicity_utils.py +23 -0
  232. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  233. helm/benchmark/metrics/unitxt_metrics.py +107 -0
  234. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  235. helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
  236. helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
  237. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  238. helm/benchmark/metrics/wildbench_metrics.py +54 -0
  239. helm/benchmark/model_deployment_registry.py +69 -5
  240. helm/benchmark/model_metadata_registry.py +58 -2
  241. helm/benchmark/multi_gpu_runner.py +133 -0
  242. helm/benchmark/presentation/contamination.py +3 -3
  243. helm/benchmark/presentation/create_plots.py +51 -20
  244. helm/benchmark/presentation/run_display.py +51 -12
  245. helm/benchmark/presentation/run_entry.py +2 -2
  246. helm/benchmark/presentation/schema.py +83 -66
  247. helm/benchmark/presentation/summarize.py +483 -388
  248. helm/benchmark/presentation/table.py +8 -8
  249. helm/benchmark/presentation/taxonomy_info.py +20 -0
  250. helm/benchmark/presentation/test_contamination.py +2 -2
  251. helm/benchmark/presentation/test_create_plots.py +4 -1
  252. helm/benchmark/presentation/test_run_entry.py +2 -2
  253. helm/benchmark/presentation/test_schema.py +11 -0
  254. helm/benchmark/presentation/test_summarize.py +148 -6
  255. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  256. helm/benchmark/reeval_run.py +202 -0
  257. helm/benchmark/reeval_runner.py +355 -0
  258. helm/benchmark/run.py +151 -87
  259. helm/benchmark/run_expander.py +418 -33
  260. helm/benchmark/run_spec.py +93 -0
  261. helm/benchmark/run_spec_factory.py +180 -0
  262. helm/benchmark/run_specs/__init__.py +0 -0
  263. helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
  264. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  265. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  266. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  267. helm/benchmark/run_specs/call_center_run_specs.py +201 -0
  268. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  269. helm/benchmark/run_specs/classic_run_specs.py +1393 -0
  270. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  271. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  272. helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
  273. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  274. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  275. helm/benchmark/run_specs/experimental_run_specs.py +224 -0
  276. helm/benchmark/run_specs/finance_run_specs.py +114 -0
  277. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  278. helm/benchmark/run_specs/heim_run_specs.py +625 -0
  279. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  280. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  281. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  282. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  283. helm/benchmark/run_specs/long_context_run_specs.py +188 -0
  284. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  285. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  286. helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
  287. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  288. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  289. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  290. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  291. helm/benchmark/run_specs/safety_run_specs.py +191 -0
  292. helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
  293. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  294. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
  295. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  296. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  297. helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
  298. helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
  299. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  300. helm/benchmark/runner.py +63 -62
  301. helm/benchmark/runner_config_registry.py +21 -0
  302. helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
  303. helm/benchmark/scenarios/air_bench_scenario.py +76 -0
  304. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  305. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  306. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
  307. helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
  308. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  309. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  310. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  311. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  312. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  313. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  314. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  315. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  316. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  317. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  318. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  319. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  320. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  321. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  322. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  323. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  324. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  325. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  326. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  327. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  328. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  329. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  330. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  331. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  332. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  333. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  334. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  335. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
  336. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  337. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
  338. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  339. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  340. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  341. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  342. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  343. helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
  344. helm/benchmark/scenarios/banking77_scenario.py +77 -0
  345. helm/benchmark/scenarios/bbq_scenario.py +17 -2
  346. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  347. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  348. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  349. helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
  350. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  351. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  352. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  353. helm/benchmark/scenarios/bold_scenario.py +18 -3
  354. helm/benchmark/scenarios/boolq_scenario.py +21 -1
  355. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  356. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  357. helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
  358. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  359. helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
  360. helm/benchmark/scenarios/clear_scenario.py +180 -0
  361. helm/benchmark/scenarios/cleva_scenario.py +482 -3
  362. helm/benchmark/scenarios/code_scenario.py +46 -4
  363. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  364. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  365. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  366. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  367. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  368. helm/benchmark/scenarios/commonsense_scenario.py +33 -1
  369. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  370. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
  371. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  372. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  373. helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
  374. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  375. helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
  376. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
  377. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
  378. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
  379. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
  380. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
  381. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
  382. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
  383. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
  384. helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
  385. helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
  386. helm/benchmark/scenarios/disinformation_scenario.py +32 -1
  387. helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
  388. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  389. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  390. helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
  391. helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
  392. helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
  393. helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
  394. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  395. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  396. helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
  397. helm/benchmark/scenarios/financebench_scenario.py +74 -0
  398. helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
  399. helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
  400. helm/benchmark/scenarios/gpqa_scenario.py +98 -0
  401. helm/benchmark/scenarios/grammar.py +2 -2
  402. helm/benchmark/scenarios/grammar_scenario.py +21 -2
  403. helm/benchmark/scenarios/gsm_scenario.py +31 -1
  404. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
  405. helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
  406. helm/benchmark/scenarios/headqa_scenario.py +158 -0
  407. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  408. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
  409. helm/benchmark/scenarios/ice_scenario.py +28 -4
  410. helm/benchmark/scenarios/ifeval_scenario.py +71 -0
  411. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  412. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  413. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  414. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  415. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  416. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  417. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  418. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  419. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  420. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  421. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  422. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  423. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  424. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  425. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  426. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  427. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  428. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  429. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  430. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  431. helm/benchmark/scenarios/imdb_scenario.py +26 -3
  432. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  433. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  434. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
  435. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  436. helm/benchmark/scenarios/koala_scenario.py +21 -1
  437. helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
  438. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
  439. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  440. helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
  441. helm/benchmark/scenarios/legal_support_scenario.py +24 -1
  442. helm/benchmark/scenarios/legalbench_scenario.py +45 -3
  443. helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
  444. helm/benchmark/scenarios/lextreme_scenario.py +22 -1
  445. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  446. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  447. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  448. helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
  449. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  450. helm/benchmark/scenarios/math_scenario.py +81 -22
  451. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  452. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  453. helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
  454. helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
  455. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  456. helm/benchmark/scenarios/med_qa_scenario.py +30 -1
  457. helm/benchmark/scenarios/medalign_scenario.py +117 -0
  458. helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
  459. helm/benchmark/scenarios/medbullets_scenario.py +167 -0
  460. helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
  461. helm/benchmark/scenarios/medec_scenario.py +148 -0
  462. helm/benchmark/scenarios/medhallu_scenario.py +95 -0
  463. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  464. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  465. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  466. helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
  467. helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
  468. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  469. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  470. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  471. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  472. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  473. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  474. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  475. helm/benchmark/scenarios/mental_health_scenario.py +146 -0
  476. helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
  477. helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
  478. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
  479. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  480. helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
  481. helm/benchmark/scenarios/mmlu_scenario.py +32 -1
  482. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  483. helm/benchmark/scenarios/msmarco_scenario.py +31 -1
  484. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
  485. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
  486. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
  487. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
  488. helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
  489. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  490. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  491. helm/benchmark/scenarios/omni_math_scenario.py +71 -0
  492. helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
  493. helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
  494. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
  495. helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
  496. helm/benchmark/scenarios/quac_scenario.py +24 -1
  497. helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
  498. helm/benchmark/scenarios/raft_scenario.py +33 -3
  499. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  500. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  501. helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
  502. helm/benchmark/scenarios/scenario.py +44 -1
  503. helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
  504. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  505. helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
  506. helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
  507. helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
  508. helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
  509. helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
  510. helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
  511. helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
  512. helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
  513. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  514. helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
  515. helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
  516. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  517. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  518. helm/benchmark/scenarios/spider_scenario.py +109 -0
  519. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
  520. helm/benchmark/scenarios/summarization_scenario.py +48 -1
  521. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  522. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  523. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
  524. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  525. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  526. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  527. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  528. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  529. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  530. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  531. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  532. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  533. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  534. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  535. helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
  536. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  537. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  538. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  539. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  540. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  541. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  542. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  543. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  544. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  545. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  546. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  547. helm/benchmark/scenarios/test_math_scenario.py +4 -3
  548. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  549. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  550. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  551. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  552. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  553. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  554. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  555. helm/benchmark/scenarios/test_scenario.py +6 -3
  556. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  557. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  558. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  559. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  560. helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
  561. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  562. helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
  563. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  564. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  565. helm/benchmark/scenarios/unitxt_scenario.py +62 -0
  566. helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
  567. helm/benchmark/scenarios/vicuna_scenario.py +22 -2
  568. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  569. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  570. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  571. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
  572. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  573. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  574. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  575. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  576. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  577. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  578. helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
  579. helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
  580. helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
  581. helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
  582. helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
  583. helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
  584. helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
  585. helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
  586. helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
  587. helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
  588. helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
  589. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  590. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  591. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  592. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  593. helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
  594. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  595. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  596. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  597. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  598. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  599. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  600. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  601. helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
  602. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  603. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
  604. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  605. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  606. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
  607. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  608. helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
  609. helm/benchmark/scenarios/wikifact_scenario.py +31 -1
  610. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  611. helm/benchmark/scenarios/wildbench_scenario.py +101 -0
  612. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  613. helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
  614. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  615. helm/benchmark/server.py +32 -2
  616. helm/benchmark/slurm_jobs.py +1 -2
  617. helm/benchmark/slurm_runner.py +78 -50
  618. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  619. helm/benchmark/static/schema_arabic.yaml +271 -0
  620. helm/benchmark/static/schema_audio.yaml +763 -0
  621. helm/benchmark/static/schema_autobencher.yaml +150 -0
  622. helm/benchmark/static/schema_call_center.yaml +269 -0
  623. helm/benchmark/static/schema_capabilities.yaml +254 -0
  624. helm/benchmark/static/schema_classic.yaml +259 -1140
  625. helm/benchmark/static/schema_cleva.yaml +768 -0
  626. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  627. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  628. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  629. helm/benchmark/static/schema_enterprise.yaml +319 -0
  630. helm/benchmark/static/schema_ewok.yaml +367 -0
  631. helm/benchmark/static/schema_finance.yaml +191 -0
  632. helm/benchmark/static/schema_heim.yaml +1389 -0
  633. helm/benchmark/static/schema_image2struct.yaml +588 -0
  634. helm/benchmark/static/schema_instruction_following.yaml +161 -0
  635. helm/benchmark/static/schema_legal.yaml +566 -0
  636. helm/benchmark/static/schema_lite.yaml +3 -286
  637. helm/benchmark/static/schema_long_context.yaml +282 -0
  638. helm/benchmark/static/schema_medhelm.yaml +1176 -0
  639. helm/benchmark/static/schema_melt.yaml +1257 -0
  640. helm/benchmark/static/schema_mmlu.yaml +1449 -0
  641. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  642. helm/benchmark/static/schema_safety.yaml +283 -0
  643. helm/benchmark/static/schema_seahelm.yaml +723 -0
  644. helm/benchmark/static/schema_slp.yaml +219 -0
  645. helm/benchmark/static/schema_slphelm.yaml +162 -0
  646. helm/benchmark/static/schema_social_audio.yaml +224 -0
  647. helm/benchmark/static/schema_sql.yaml +171 -0
  648. helm/benchmark/static/schema_thai.yaml +244 -0
  649. helm/benchmark/static/schema_torr.yaml +474 -0
  650. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  651. helm/benchmark/static/schema_unitxt.yaml +370 -0
  652. helm/benchmark/static/schema_vhelm.yaml +933 -0
  653. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  654. helm/benchmark/static/schema_video.yaml +219 -0
  655. helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
  656. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  657. helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
  658. helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
  659. helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
  660. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  661. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  662. helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
  663. helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
  664. helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
  665. helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
  666. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  667. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  668. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  669. helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
  670. helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
  671. helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
  672. helm/benchmark/static_build/config.js +4 -0
  673. helm/benchmark/static_build/index.html +19 -0
  674. helm/benchmark/test_data_preprocessor.py +3 -3
  675. helm/benchmark/test_run_expander.py +1 -1
  676. helm/benchmark/window_services/default_window_service.py +3 -45
  677. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
  678. helm/benchmark/window_services/ice_window_service.py +1 -35
  679. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  680. helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
  681. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  682. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  683. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  684. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  685. helm/benchmark/window_services/local_window_service.py +22 -5
  686. helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
  687. helm/benchmark/window_services/test_bloom_window_service.py +5 -4
  688. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  689. helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
  690. helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
  691. helm/benchmark/window_services/test_gptj_window_service.py +11 -5
  692. helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
  693. helm/benchmark/window_services/test_openai_window_service.py +18 -12
  694. helm/benchmark/window_services/test_opt_window_service.py +6 -5
  695. helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
  696. helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
  697. helm/benchmark/window_services/test_t511b_window_service.py +5 -4
  698. helm/benchmark/window_services/test_ul2_window_service.py +5 -4
  699. helm/benchmark/window_services/test_utils.py +6 -6
  700. helm/benchmark/window_services/test_yalm_window_service.py +5 -4
  701. helm/benchmark/window_services/tokenizer_service.py +7 -13
  702. helm/benchmark/window_services/window_service.py +42 -0
  703. helm/benchmark/window_services/window_service_factory.py +4 -1
  704. helm/benchmark/window_services/yalm_window_service.py +1 -28
  705. helm/clients/__init__.py +0 -0
  706. helm/{proxy/clients → clients}/ai21_client.py +78 -12
  707. helm/clients/aleph_alpha_client.py +114 -0
  708. helm/{proxy/clients → clients}/anthropic_client.py +304 -21
  709. helm/clients/audio_language/__init__.py +0 -0
  710. helm/clients/audio_language/diva_llama_client.py +122 -0
  711. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  712. helm/clients/audio_language/llama_omni/constants.py +9 -0
  713. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  714. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  715. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  716. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  717. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  718. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  719. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  720. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  721. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  722. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  723. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  724. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  725. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  726. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  727. helm/clients/audio_language/llama_omni/utils.py +202 -0
  728. helm/clients/audio_language/llama_omni_client.py +199 -0
  729. helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
  730. helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
  731. helm/clients/audio_language/qwen_audiolm_client.py +153 -0
  732. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  733. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  734. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  735. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  736. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  737. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  738. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  739. helm/clients/audio_language/test.py +62 -0
  740. helm/{proxy/clients → clients}/auto_client.py +72 -31
  741. helm/clients/azure_openai_client.py +55 -0
  742. helm/clients/bedrock_client.py +381 -0
  743. helm/clients/bedrock_utils.py +105 -0
  744. helm/{proxy/clients → clients}/client.py +92 -17
  745. helm/clients/clip_score_client.py +49 -0
  746. helm/clients/clip_scorers/__init__.py +0 -0
  747. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  748. helm/clients/clip_scorers/clip_scorer.py +50 -0
  749. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  750. helm/{proxy/clients → clients}/cohere_client.py +105 -14
  751. helm/clients/dspy_client.py +135 -0
  752. helm/clients/gcs_client.py +82 -0
  753. helm/{proxy/clients → clients}/google_client.py +8 -6
  754. helm/clients/google_translate_client.py +35 -0
  755. helm/clients/grok_client.py +36 -0
  756. helm/{proxy/clients → clients}/http_model_client.py +8 -8
  757. helm/{proxy/clients → clients}/huggingface_client.py +157 -86
  758. helm/clients/huggingface_pipeline_client.py +138 -0
  759. helm/clients/ibm_client.py +269 -0
  760. helm/clients/image_generation/__init__.py +0 -0
  761. helm/clients/image_generation/adobe_vision_client.py +80 -0
  762. helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
  763. helm/clients/image_generation/cogview2/__init__.py +0 -0
  764. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  765. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  766. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  767. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
  768. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  769. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  770. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
  771. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  772. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  773. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  774. helm/clients/image_generation/cogview2_client.py +192 -0
  775. helm/clients/image_generation/dalle2_client.py +194 -0
  776. helm/clients/image_generation/dalle3_client.py +108 -0
  777. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  778. helm/clients/image_generation/dalle_mini/data.py +442 -0
  779. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  780. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  781. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  782. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  783. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  784. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  785. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  786. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  787. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  788. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  789. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  790. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  791. helm/clients/image_generation/dalle_mini_client.py +191 -0
  792. helm/clients/image_generation/deep_floyd_client.py +80 -0
  793. helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
  794. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  795. helm/clients/image_generation/lexica_client.py +88 -0
  796. helm/clients/image_generation/mindalle/__init__.py +0 -0
  797. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  798. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  799. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  800. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  801. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  802. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  803. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  804. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  805. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  806. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  807. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  808. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  809. helm/clients/image_generation/mindalle_client.py +116 -0
  810. helm/clients/image_generation/nudity_check_client.py +64 -0
  811. helm/clients/image_generation/together_image_generation_client.py +113 -0
  812. helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
  813. helm/{proxy/clients → clients}/megatron_client.py +7 -5
  814. helm/clients/mistral_client.py +180 -0
  815. helm/clients/moderation_api_client.py +111 -0
  816. helm/clients/nvidia_nim_client.py +32 -0
  817. helm/clients/open_lm_client.py +43 -0
  818. helm/clients/openai_client.py +604 -0
  819. helm/clients/openai_responses_client.py +200 -0
  820. helm/clients/openrouter_client.py +31 -0
  821. helm/{proxy/clients → clients}/palmyra_client.py +31 -14
  822. helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
  823. helm/clients/reka_client.py +190 -0
  824. helm/clients/simple_client.py +64 -0
  825. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  826. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  827. helm/clients/stanfordhealthcare_google_client.py +43 -0
  828. helm/clients/stanfordhealthcare_http_model_client.py +95 -0
  829. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  830. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  831. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  832. helm/clients/test_client.py +98 -0
  833. helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
  834. helm/clients/test_openrouter_client.py +69 -0
  835. helm/clients/test_simple_client.py +19 -0
  836. helm/clients/test_together_client.py +184 -0
  837. helm/clients/together_client.py +599 -0
  838. helm/clients/upstage_client.py +23 -0
  839. helm/clients/vertexai_client.py +488 -0
  840. helm/clients/vision_language/__init__.py +0 -0
  841. helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
  842. helm/clients/vision_language/huggingface_vlm_client.py +114 -0
  843. helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
  844. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  845. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  846. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  847. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  848. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  849. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  850. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  851. helm/clients/vision_language/open_flamingo_client.py +155 -0
  852. helm/clients/vision_language/paligemma_client.py +147 -0
  853. helm/clients/vision_language/palmyra_vision_client.py +101 -0
  854. helm/clients/vision_language/qwen2_vlm_client.py +189 -0
  855. helm/clients/vision_language/qwen_vlm_client.py +174 -0
  856. helm/clients/vllm_client.py +80 -0
  857. helm/clients/vllm_granite_thinking_client.py +56 -0
  858. helm/clients/writer_client.py +105 -0
  859. helm/clients/yi_client.py +28 -0
  860. helm/common/audio_utils.py +111 -0
  861. helm/common/cache.py +23 -33
  862. helm/common/cache_backend_config.py +47 -0
  863. helm/common/clip_score_request.py +41 -0
  864. helm/common/context.py +80 -0
  865. helm/common/credentials_utils.py +5 -5
  866. helm/common/critique_request.py +10 -2
  867. helm/common/file_caches/__init__.py +0 -0
  868. helm/common/file_caches/file_cache.py +16 -0
  869. helm/common/file_caches/local_file_cache.py +61 -0
  870. helm/common/file_caches/test_local_file_cache.py +25 -0
  871. helm/common/file_upload_request.py +27 -0
  872. helm/common/general.py +10 -3
  873. helm/common/hierarchical_logger.py +124 -12
  874. helm/common/image_generation_parameters.py +25 -0
  875. helm/common/images_utils.py +60 -5
  876. helm/common/key_value_store.py +41 -10
  877. helm/common/local_context.py +140 -0
  878. helm/common/media_object.py +14 -1
  879. helm/common/moderations_api_request.py +71 -0
  880. helm/common/mongo_key_value_store.py +8 -7
  881. helm/common/multimodal_request_utils.py +57 -0
  882. helm/common/nudity_check_request.py +29 -0
  883. helm/common/object_spec.py +23 -8
  884. helm/common/optional_dependencies.py +1 -1
  885. helm/common/reeval_parameters.py +12 -0
  886. helm/common/remote_context.py +61 -0
  887. helm/common/request.py +45 -19
  888. helm/common/response_format.py +18 -0
  889. helm/common/test_cache.py +1 -48
  890. helm/common/test_general.py +10 -0
  891. helm/common/test_logging.py +94 -0
  892. helm/common/test_media_object.py +1 -1
  893. helm/common/tokenization_request.py +1 -10
  894. helm/config/model_deployments.yaml +4713 -1005
  895. helm/config/model_metadata.yaml +4045 -255
  896. helm/config/tokenizer_configs.yaml +1091 -50
  897. helm/proxy/accounts.py +31 -4
  898. helm/proxy/cli.py +6 -4
  899. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  900. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  901. helm/proxy/critique/model_critique_client.py +40 -10
  902. helm/proxy/example_queries.py +33 -28
  903. helm/proxy/retry.py +5 -0
  904. helm/proxy/server.py +82 -18
  905. helm/proxy/services/remote_service.py +32 -7
  906. helm/proxy/services/server_service.py +71 -69
  907. helm/proxy/services/service.py +30 -6
  908. helm/proxy/services/test_remote_service.py +6 -5
  909. helm/proxy/services/test_service.py +1 -13
  910. helm/proxy/static/help.html +99 -0
  911. helm/proxy/static/index.css +61 -0
  912. helm/proxy/static/index.html +40 -0
  913. helm/proxy/static/index.js +462 -0
  914. helm/proxy/test_accounts.py +32 -0
  915. helm/proxy/test_retry.py +1 -1
  916. helm/proxy/token_counters/auto_token_counter.py +37 -37
  917. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  918. helm/proxy/token_counters/token_counter.py +3 -5
  919. helm/tokenizers/__init__.py +0 -0
  920. helm/tokenizers/ai21_tokenizer.py +52 -0
  921. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
  922. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
  923. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
  924. helm/tokenizers/cohere_tokenizer.py +50 -0
  925. helm/tokenizers/grok_tokenizer.py +55 -0
  926. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
  927. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
  928. helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
  929. helm/tokenizers/simple_tokenizer.py +33 -0
  930. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  931. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
  932. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  933. helm/tokenizers/test_grok_tokenizer.py +33 -0
  934. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
  935. helm/tokenizers/test_simple_tokenizer.py +33 -0
  936. helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
  937. helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
  938. helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
  939. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  940. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
  941. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  942. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  943. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  944. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  945. crfm_helm-0.4.0.dist-info/METADATA +0 -264
  946. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  947. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  948. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  949. helm/benchmark/data_overlap/light_scenario.py +0 -60
  950. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  951. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  952. helm/benchmark/run_specs.py +0 -2762
  953. helm/benchmark/scenarios/numeracy_scenario.py +0 -784
  954. helm/benchmark/static/benchmarking.css +0 -156
  955. helm/benchmark/static/benchmarking.js +0 -1705
  956. helm/benchmark/static/config.js +0 -3
  957. helm/benchmark/static/images/helm-logo.png +0 -0
  958. helm/benchmark/static/images/language-model-helm.png +0 -0
  959. helm/benchmark/static/images/organizations/ai21.png +0 -0
  960. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  961. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  962. helm/benchmark/static/images/organizations/cohere.png +0 -0
  963. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  964. helm/benchmark/static/images/organizations/google.png +0 -0
  965. helm/benchmark/static/images/organizations/meta.png +0 -0
  966. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  967. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  968. helm/benchmark/static/images/organizations/openai.png +0 -0
  969. helm/benchmark/static/images/organizations/together.png +0 -0
  970. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  971. helm/benchmark/static/images/organizations/yandex.png +0 -0
  972. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  973. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  974. helm/benchmark/static/index.html +0 -68
  975. helm/benchmark/static/json-urls.js +0 -69
  976. helm/benchmark/static/plot-captions.js +0 -27
  977. helm/benchmark/static/utils.js +0 -285
  978. helm/benchmark/test_model_deployment_definition.py +0 -92
  979. helm/benchmark/test_model_properties.py +0 -1570
  980. helm/benchmark/vlm_run_specs.py +0 -97
  981. helm/benchmark/window_services/ai21_window_service.py +0 -258
  982. helm/benchmark/window_services/cohere_window_service.py +0 -163
  983. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  984. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  985. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  986. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  987. helm/benchmark/window_services/t511b_window_service.py +0 -30
  988. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  989. helm/benchmark/window_services/test_cohere_window_service.py +0 -74
  990. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  991. helm/benchmark/window_services/test_ice_window_service.py +0 -326
  992. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  993. helm/benchmark/window_services/ul2_window_service.py +0 -30
  994. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  995. helm/common/cache_utils.py +0 -14
  996. helm/proxy/clients/aleph_alpha_client.py +0 -95
  997. helm/proxy/clients/goose_ai_client.py +0 -99
  998. helm/proxy/clients/microsoft_client.py +0 -180
  999. helm/proxy/clients/openai_client.py +0 -206
  1000. helm/proxy/clients/simple_client.py +0 -60
  1001. helm/proxy/clients/test_client.py +0 -49
  1002. helm/proxy/clients/test_together_client.py +0 -97
  1003. helm/proxy/clients/together_client.py +0 -334
  1004. helm/proxy/clients/vertexai_client.py +0 -115
  1005. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  1006. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  1007. helm/proxy/token_counters/free_token_counter.py +0 -12
  1008. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  1009. helm/proxy/token_counters/openai_token_counter.py +0 -22
  1010. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  1011. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  1012. helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
  1013. helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
  1014. helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
  1015. helm/proxy/tokenizers/ice_tokenizer.py +0 -30
  1016. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  1017. helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
  1018. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  1019. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
  1020. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  1021. /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
  1022. /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
  1023. /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
  1024. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  1025. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  1026. /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
  1027. /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
  1028. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  1029. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  1030. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  1031. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  1032. /helm/{benchmark → proxy}/static/general.js +0 -0
  1033. /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -0,0 +1,831 @@
1
+ from dataclasses import replace
2
+ from functools import partial
3
+ from typing import Callable, Dict, List, Optional, Set, Tuple, cast
4
+ import re
5
+ import string
6
+
7
+ from nltk.metrics.scores import f_measure
8
+ from nltk.tokenize import word_tokenize
9
+ from nltk.translate.bleu_score import sentence_bleu
10
+ from rouge_score import rouge_scorer
11
+ import numpy as np
12
+
13
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
14
+ from helm.benchmark.adaptation.request_state import RequestState
15
+ from helm.benchmark.metrics import code_metrics_helper
16
+ from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
17
+ from helm.benchmark.metrics.metric import MetricMetadata
18
+ from helm.benchmark.metrics.metric_name import MetricName
19
+ from helm.benchmark.metrics.metric_service import MetricService
20
+ from helm.benchmark.metrics.nltk_helper import install_nltk_resources
21
+ from helm.benchmark.metrics.statistic import Stat
22
+ from helm.benchmark.scenarios.code_scenario import CodeReference
23
+ from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
24
+ from helm.benchmark.scenarios.scenario import Reference
25
+ from helm.common.optional_dependencies import handle_module_not_found_error
26
+ from helm.common.request import GeneratedOutput
27
+
28
+
29
+ install_nltk_resources()
30
+
31
+
32
+ def pass_at_k_estimator(n: int, c: int, k: int) -> float:
33
+ """Calculates 1 - comb(n - c, k) / comb(n, k).
34
+
35
+ Numerically stable version defined in
36
+ https://arxiv.org/pdf/2107.03374.pdf
37
+ """
38
+ if n - c < k:
39
+ return 1.0
40
+ return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)).item()
41
+
42
+
43
+ def normalize_text(text: str, should_remove_articles: bool = True) -> str:
44
+ """Lower text and remove punctuation, articles and extra whitespace.
45
+ Copied from the [QuAC](http://quac.ai/) evaluation script found at
46
+ https://s3.amazonaws.com/my89public/quac/scorer.py"""
47
+
48
+ def remove_articles(text: str) -> str:
49
+ return re.sub(r"\b(a|an|the)\b", " ", text)
50
+
51
+ def white_space_fix(text: str) -> str:
52
+ return " ".join(text.split())
53
+
54
+ def remove_punc(text: str) -> str:
55
+ exclude = set(string.punctuation)
56
+ return "".join(ch for ch in text if ch not in exclude)
57
+
58
+ def lower(text: str) -> str:
59
+ return text.lower()
60
+
61
+ normalized_text = remove_punc(lower(text))
62
+ if should_remove_articles:
63
+ normalized_text = remove_articles(normalized_text)
64
+ return white_space_fix(normalized_text)
65
+
66
+
67
+ def exact_match(gold: str, pred: str) -> float:
68
+ if not pred:
69
+ return 0
70
+
71
+ return 1 if gold.strip() == pred.strip() else 0
72
+
73
+
74
+ def quasi_exact_match(gold: str, pred: str) -> float:
75
+ if not pred:
76
+ return 0
77
+
78
+ return 1 if normalize_text(gold) == normalize_text(pred) else 0
79
+
80
+
81
+ def quasi_leave_articles_exact_match(gold: str, pred: str) -> float:
82
+ if not pred:
83
+ return 0
84
+
85
+ return (
86
+ 1
87
+ if normalize_text(gold, should_remove_articles=False) == normalize_text(pred, should_remove_articles=False)
88
+ else 0
89
+ )
90
+
91
+
92
+ def prefix_exact_match(gold: str, pred: str) -> float:
93
+ """
94
+ The `prefix_exact_match` metric is particularly useful in the zero-shot setting, where the model is
95
+ not given examples of the expected outputs and tends to output more tokens than it should.
96
+
97
+ For example, for this zero-shot prompt from BoolQ,
98
+
99
+ Passage: Elmendorf Air Force Base (IATA: EDF, ICAO: PAED, FAA LID: EDF) is a United States military facility
100
+ in Anchorage, the largest city in Alaska. Originally known as Elmendorf Field, it became Elmendorf Air Force
101
+ Base after World War II, and in 2010 it merged with nearby Fort Richardson to form Joint Base Elmendorf-Richardson.
102
+ Question: Is there an air force base in anchorage alaska?
103
+ Answer:
104
+
105
+ the model could output up to `max_tokens` number of tokens "Yes, Elmendorf" instead of just "Yes".
106
+ """
107
+ if not pred:
108
+ return 0
109
+
110
+ return 1 if pred.strip().startswith(gold.strip()) else 0
111
+
112
+
113
+ def quasi_prefix_exact_match(gold: str, pred: str) -> float:
114
+ """
115
+ Same thing as `prefix_exact_match` but we normalize the text before checking if the prefix match.
116
+ """
117
+ if not pred:
118
+ return 0
119
+
120
+ return 1 if normalize_text(pred).startswith(normalize_text(gold)) else 0
121
+
122
+
123
+ def f1_score(gold: str, pred: str) -> float:
124
+ ret = f_measure(set(normalize_text(gold).split()), set(normalize_text(pred).split()))
125
+ if ret is None: # answer is the empty string after normalizing
126
+ return 0.0
127
+
128
+ return ret
129
+
130
+
131
+ def exact_match_indicator(gold: str, pred: str, indicator: str = " ") -> float:
132
+ """
133
+ Exact match, allowing for some preceding context.
134
+ For example, the following two answers are considered matching:
135
+ - Because of x and y, the answer is ## <answer>
136
+ - Given reasons y and z, the answer is ## <answer>
137
+ While the following is considered different from the earlier two
138
+ - Given reasons x and a, the answer is ## <other answer>
139
+ """
140
+ pred = pred.split(indicator)[-1].strip()
141
+ gold = gold.split(indicator)[-1].strip()
142
+ return exact_match(gold, pred)
143
+
144
+
145
+ def final_number_exact_match(gold: str, pred: str) -> float:
146
+ """
147
+ Returns 1 iff the final number in gold and pred match.
148
+ Similar to exact_match_indicator.
149
+ Example:
150
+ - gold = "The answer is 15."
151
+ - pred = "The answer is 15 eggs."
152
+ - Returns 1
153
+ """
154
+
155
+ def get_final_number(x: str) -> str:
156
+ matches = re.findall(r"-?[\d,]+(?:.\d+)?", x)
157
+ if not matches:
158
+ return ""
159
+ return matches[-1].replace(",", "")
160
+
161
+ return exact_match(get_final_number(gold), get_final_number(pred))
162
+
163
+
164
+ def rouge_score(gold: str, pred: str, rouge_type: str, scorer: rouge_scorer.RougeScorer) -> float:
165
+ scores = scorer.score(gold, pred)
166
+ return scores[rouge_type].fmeasure
167
+
168
+
169
+ def get_rouge_function(rouge_type: str) -> Callable[[str, str], float]:
170
+ scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=True)
171
+ return partial(rouge_score, scorer=scorer, rouge_type=rouge_type)
172
+
173
+
174
+ def bleu_1(gold: str, pred: str) -> float:
175
+ return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(1, 0, 0, 0))
176
+
177
+
178
+ def chinese_bleu_1(gold: str, pred: str) -> float:
179
+ char_tokenizer = ChineseTokenizer()
180
+ return sentence_bleu([char_tokenizer.tokenize(gold)], char_tokenizer.tokenize(pred), weights=(1, 0, 0, 0))
181
+
182
+
183
+ def get_chinese_rouge_function(rouge_type: str) -> Callable[[str, str], float]:
184
+ char_tokenizer = ChineseTokenizer()
185
+ scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=True, tokenizer=char_tokenizer)
186
+ return partial(rouge_score, scorer=scorer, rouge_type=rouge_type)
187
+
188
+
189
+ def cleva_math_result_match(gold: str, pred: str) -> float:
190
+ """
191
+ Exact match that only cares the last math expression.
192
+ Common math expressions are numbers and fractions.
193
+ """
194
+ pattern = r"[-+*/%\.\(\)\d]+"
195
+ matches = re.findall(pattern, pred)
196
+ if matches:
197
+ pred = matches[-1].lstrip(")")
198
+ # remove space in front or at the end
199
+ pred = pred.strip()
200
+ return exact_match(gold, pred)
201
+
202
+
203
+ def bleu_4(gold: str, pred: str) -> float:
204
+ return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(0, 0, 0, 1))
205
+
206
+
207
+ def cider(gold: str, pred: str) -> float:
208
+ try:
209
+ from pycocoevalcap.cider.cider import Cider
210
+ except ModuleNotFoundError as e:
211
+ handle_module_not_found_error(e, ["vlm"])
212
+
213
+ cider_evaluator = Cider()
214
+ candidate = {"caption": [pred]}
215
+ reference = {"caption": [gold]}
216
+ average_score, _ = cider_evaluator.compute_score(reference, candidate)
217
+ return average_score
218
+
219
+
220
+ def wer_score(gold: str, pred: str) -> float:
221
+ # Word Error Rate (WER), which is a common
222
+ # metric used to evaluate the accuracy of speech recognition systems.
223
+ # The lower the better. The WER might be greater than 1.
224
+ # https://huggingface.co/learn/audio-course/en/chapter5/evaluation#word-error-rate
225
+ try:
226
+ from jiwer import wer
227
+ except ModuleNotFoundError as e:
228
+ handle_module_not_found_error(e, ["audiolm"])
229
+
230
+ if not pred:
231
+ return 0
232
+ gold = normalize_text(gold, should_remove_articles=False)
233
+ pred = normalize_text(pred, should_remove_articles=False)
234
+ wer_ret = wer(gold, pred)
235
+ return wer_ret
236
+
237
+
238
+ def mer_score(gold: str, pred: str) -> float:
239
+ # Match Error Rate (MER), which is for evaluating the error rate of
240
+ # speech recognition systems. The lower the better.
241
+ try:
242
+ from jiwer import mer
243
+ except ModuleNotFoundError as e:
244
+ handle_module_not_found_error(e, ["audiolm"])
245
+
246
+ if not pred:
247
+ return 0
248
+
249
+ gold = normalize_text(gold, should_remove_articles=False)
250
+ pred = normalize_text(pred, should_remove_articles=False)
251
+ mer_ret = mer(gold, pred)
252
+ return mer_ret
253
+
254
+
255
+ def wip_score(gold: str, pred: str) -> float:
256
+ # Word information preservation (WIP) for evaluating the preserved information of speech
257
+ # recognition systems. The higher the better.
258
+ try:
259
+ from jiwer import wip
260
+ except ModuleNotFoundError as e:
261
+ handle_module_not_found_error(e, ["audiolm"])
262
+
263
+ if not pred:
264
+ return 0
265
+
266
+ gold = normalize_text(gold, should_remove_articles=False)
267
+ pred = normalize_text(pred, should_remove_articles=False)
268
+ wip_ret = wip(gold, pred)
269
+ return wip_ret
270
+
271
+
272
+ def cer_score(gold: str, pred: str) -> float:
273
+ # Character Error Rate (CER) for evaluating the accuracy
274
+ # of speech recognition systems. The lower the better.
275
+ try:
276
+ from jiwer import cer
277
+ except ModuleNotFoundError as e:
278
+ handle_module_not_found_error(e, ["audiolm"])
279
+
280
+ if not pred:
281
+ return 0
282
+
283
+ gold = normalize_text(gold, should_remove_articles=False)
284
+ pred = normalize_text(pred, should_remove_articles=False)
285
+ cer_ret = cer(gold, pred)
286
+ assert isinstance(cer_ret, float)
287
+ return cer_ret
288
+
289
+
290
+ def chinese_wer_score(gold: str, pred: str) -> float:
291
+ try:
292
+ import jieba
293
+ except ModuleNotFoundError as e:
294
+ handle_module_not_found_error(e, ["audiolm"])
295
+
296
+ return wer_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))
297
+
298
+
299
+ def chinese_mer_score(gold: str, pred: str) -> float:
300
+ try:
301
+ import jieba
302
+ except ModuleNotFoundError as e:
303
+ handle_module_not_found_error(e, ["audiolm"])
304
+
305
+ return mer_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))
306
+
307
+
308
+ def chinese_wip_score(gold: str, pred: str) -> float:
309
+ try:
310
+ import jieba
311
+ except ModuleNotFoundError as e:
312
+ handle_module_not_found_error(e, ["audiolm"])
313
+
314
+ return wip_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))
315
+
316
+
317
+ def chinese_cer_score(gold: str, pred: str) -> float:
318
+ try:
319
+ import jieba
320
+ except ModuleNotFoundError as e:
321
+ handle_module_not_found_error(e, ["audiolm"])
322
+
323
+ return cer_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))
324
+
325
+
326
+ def extract_set_from_text(
327
+ set_str: str,
328
+ set_start_str: str = " is ",
329
+ set_separator: str = " and ",
330
+ empty_set_str: str = "Nothing.",
331
+ ) -> Set[str]:
332
+ """
333
+ Given a string, extract the set of strings implied by that string.
334
+ set_start_str denotes the start of the set
335
+ set_separator denotes the string separating set elements
336
+ empty_set_str is the string which denotes the empty set
337
+ """
338
+ if set_str == empty_set_str:
339
+ return set()
340
+ set_str = set_str.replace(".", "")
341
+ extracted_set = set(set_str.split(set_start_str)[-1].split(set_separator))
342
+ return extracted_set
343
+
344
+
345
+ def extract_gold_pred_sets(gold: str, pred: str) -> Tuple[Set[str], Set[str]]:
346
+ """Extract the set of strings implied by the gold and pred strings"""
347
+ gold_set = extract_set_from_text(gold)
348
+ pred_set = extract_set_from_text(pred.split("\n")[0])
349
+ return gold_set, pred_set
350
+
351
+
352
+ def iou_set_match(gold: str, pred: str) -> float:
353
+ """Compute the intersection over union of the gold and pred sets"""
354
+ gold_set, pred_set = extract_gold_pred_sets(gold, pred)
355
+ if len(gold_set) == 0: # If gold is empty, just check if the pred set is also empty
356
+ return float(gold_set == pred_set)
357
+ return len(gold_set.intersection(pred_set)) / len(gold_set.union(pred_set))
358
+
359
+
360
+ def f1_set_match(gold: str, pred: str) -> float:
361
+ """Compute the F1 score of the gold and pred sets"""
362
+ gold_set, pred_set = extract_gold_pred_sets(gold, pred)
363
+ if len(gold_set) == 0: # If gold is empty, just check if the pred set is also empty
364
+ return float(gold_set == pred_set)
365
+ true_positives = gold_set.intersection(pred_set)
366
+ return 2 * len(true_positives) / (len(gold_set) + len(pred_set))
367
+
368
+
369
+ def exact_set_match(gold: str, pred: str) -> float:
370
+ """Compute whether the sets generated exactly match"""
371
+ gold_set, pred_set = extract_gold_pred_sets(gold, pred)
372
+ return float(gold_set == pred_set)
373
+
374
+
375
+ def absolute_value_difference(gold: str, pred: str) -> float:
376
+ """Compute the absolute value of the difference between two numbers (provided as strings),
377
+ or 0.0 if invalid input.
378
+ """
379
+
380
+ def maybe_int(text: str):
381
+ """Parse int, ignoring commas in numbers."""
382
+ try:
383
+ val = int(text.replace(",", ""))
384
+ except ValueError:
385
+ return 0.0
386
+ return val
387
+
388
+ gold_val = maybe_int(gold)
389
+ pred_val = maybe_int(pred)
390
+ return abs(gold_val - pred_val)
391
+
392
+
393
+ def code_eval(gold: Tuple[str, Optional[Dict]], pred: str) -> float:
394
+ """Evaluate Code Correctness on test examples."""
395
+ assert gold[1] is not None # gold[1]["canonical_solution"]
396
+ # Warning: will execute machine generated code; need to sandbox before executing
397
+ return float(code_metrics_helper.check_correctness(gold[1], pred, 3.0)["passed"]) # type: ignore
398
+
399
+
400
+ def _apply_output_mapping_pattern(pattern: str, prediction: str) -> str:
401
+ match = re.search(pattern, prediction)
402
+ if not match:
403
+ return ""
404
+ elif match.groups():
405
+ return match.group(0)
406
+ else:
407
+ return match.string
408
+
409
+
410
+ # TODO This should probably be made into an implementation of MetricInterface. For now it lives here
411
+ # just to separate it from basic_metrics.py.
412
+ def compute_reference_metrics(
413
+ names: List[str], adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
414
+ ) -> List[Stat]:
415
+ """
416
+ Setup:
417
+
418
+ - Gold (correct references): G1 ... Gm
419
+ - Predictions (completions): P1 ... Pk
420
+
421
+ For each pair (G, P), we can define a ${score} (e.g., exact match, F1, BLEU).
422
+
423
+ We define the following stats:
424
+
425
+ - ${score}: max_i score(Gi, P1)
426
+ - ${score}@k: max_{i,j} score(Gi, Pj)
427
+ """
428
+
429
+ def compute_metrics_helper(
430
+ name: MetricName,
431
+ score_func: Callable,
432
+ group: Optional[str] = None,
433
+ ) -> List[Stat]:
434
+ if name.name == "pass": # Calculate pass@k for HumanEval from CodeScenario.
435
+ score_func = cast(Callable[[Tuple[str, Optional[Dict]], str], float], score_func) # Make mypy happy.
436
+ code_golds = cast(List[CodeReference], golds)
437
+ results = [score_func((gold.output.text, gold.test_cases), pred) for gold in code_golds for pred in preds]
438
+ _len, _sum = len(results), int(sum(results)) # Cast to int to make type match.
439
+ score_1 = pass_at_k_estimator(_len, _sum, 1)
440
+ score_k = pass_at_k_estimator(_len, _sum, adapter_spec.num_outputs)
441
+ elif name.name == "code_eval_acc":
442
+ score_func = cast(Callable[[Tuple[str, Optional[Dict]], str], float], score_func) # Make mypy happy.
443
+ code_golds = cast(List[CodeReference], golds)
444
+ score_1 = max(score_func((gold.output.text, gold.test_cases), preds[0]) for gold in code_golds)
445
+ score_k = max(
446
+ score_func((gold.output.text, gold.test_cases), pred) for gold in code_golds for pred in preds
447
+ )
448
+ else:
449
+ score_func = cast(Callable[[str, str], float], score_func) # Make mypy happy.
450
+ score_1 = max(score_func(gold.output.text, preds[0]) for gold in golds)
451
+ score_k = max(score_func(gold.output.text, pred) for gold in golds for pred in preds)
452
+
453
+ metrics = [Stat(name).add(score_1)] # score_1 corresponds using one prediction
454
+ if adapter_spec.num_outputs != 1:
455
+ metrics.append(Stat(replace(name, name=f"{name.name}@{adapter_spec.num_outputs}")).add(score_k))
456
+ return metrics
457
+
458
+ # maps each string metric name to its associated function
459
+ metric_fn_mapping: Dict[str, Callable] = {
460
+ "exact_match": exact_match,
461
+ "quasi_exact_match": quasi_exact_match,
462
+ "quasi_leave_articles_exact_match": quasi_leave_articles_exact_match,
463
+ "prefix_exact_match": prefix_exact_match,
464
+ "quasi_prefix_exact_match": quasi_prefix_exact_match,
465
+ "exact_match_indicator": exact_match_indicator,
466
+ "final_number_exact_match": final_number_exact_match,
467
+ "exact_set_match": exact_set_match,
468
+ "iou_set_match": iou_set_match,
469
+ "f1_set_match": f1_set_match,
470
+ "math_equiv": is_equiv,
471
+ "math_equiv_chain_of_thought": is_equiv_chain_of_thought,
472
+ "code_eval_acc": code_eval,
473
+ "pass": code_eval,
474
+ "cider": cider,
475
+ "f1_score": f1_score,
476
+ "rouge_1": get_rouge_function("rouge1"),
477
+ "rouge_2": get_rouge_function("rouge2"),
478
+ "rouge_l": get_rouge_function("rougeL"),
479
+ "bleu_1": bleu_1,
480
+ "bleu_4": bleu_4,
481
+ "chinese_bleu_1": chinese_bleu_1,
482
+ "chinese_rouge_1": get_chinese_rouge_function("rouge1"),
483
+ "chinese_rouge_2": get_chinese_rouge_function("rouge2"),
484
+ "cleva_math_result_match": cleva_math_result_match,
485
+ "absolute_value_difference": absolute_value_difference,
486
+ "wer_score": wer_score,
487
+ "mer_score": mer_score,
488
+ "wip_score": wip_score,
489
+ "cer_score": cer_score,
490
+ "chinese_wer_score": chinese_wer_score,
491
+ "chinese_mer_score": chinese_mer_score,
492
+ "chinese_wip_score": chinese_wip_score,
493
+ "chinese_cer_score": chinese_cer_score,
494
+ }
495
+
496
+ stats: List[Stat] = []
497
+
498
+ # Gold outputs
499
+ golds: List[Reference] = [reference for reference in request_state.instance.references if reference.is_correct]
500
+ assert len(golds) > 0
501
+
502
+ # Predicted outputs
503
+ assert request_state.result is not None
504
+ sorted_completions: List[GeneratedOutput] = sorted(request_state.result.completions, key=lambda x: -x.logprob)
505
+ preds: List[str] = [completion.text.strip() for completion in sorted_completions]
506
+
507
+ # Apply mapping if exists (e.g., for multiple-choice questions A -> Boston, B -> New York)
508
+ # Note: If 'A' and 'B' were the only possible choices, smaller language models like GPT-2 would
509
+ # sometimes predict a random letter like 'M'.
510
+ if request_state.output_mapping is not None:
511
+ if adapter_spec.output_mapping_pattern:
512
+ preds = [_apply_output_mapping_pattern(adapter_spec.output_mapping_pattern, pred) for pred in preds]
513
+ preds = [request_state.output_mapping.get(pred) for pred in preds] # type: ignore
514
+
515
+ # Compute max_prob, the probability that the model assigns to its generated text.
516
+ # Use the log prob of sorted_completions[0], which is the completion with the highest
517
+ # log_prob. We use this since that's what's used for computing metrics like exact_match.
518
+ # One subtlety is that when computing exact_match, we strip whitespace, so the actual
519
+ # max_prob is the sum of all the probabilities in the set {x : strip(x) = prediction}.
520
+ # In practice, we think this may not make much of a difference because models may not place
521
+ # high probabilities on having additional spaces (should check this). Also, the sum
522
+ # involves computing the log_prob for many completions which could be intractable.
523
+ max_prob = np.exp(sorted_completions[0].logprob)
524
+ stats.append(Stat(MetricName("max_prob")).add(max_prob))
525
+
526
+ # Add other metrics
527
+ for metric_name in names:
528
+ if metric_name in metric_fn_mapping:
529
+ stats.extend(compute_metrics_helper(MetricName(metric_name), metric_fn_mapping[metric_name]))
530
+ else:
531
+ raise NameError(f"{metric_name} is not in the list of metric functions.")
532
+
533
+ return stats
534
+
535
+
536
+ _METRIC_METADATA_MAPPING: Dict[str, MetricMetadata] = {
537
+ "exact_match": MetricMetadata(
538
+ name="exact_match",
539
+ display_name="Exact match",
540
+ short_display_name="EM",
541
+ description="Fraction of instances that the predicted output matches a correct reference exactly.",
542
+ lower_is_better=False,
543
+ group="accuracy",
544
+ ),
545
+ "quasi_exact_match": MetricMetadata(
546
+ name="quasi_exact_match",
547
+ display_name="Quasi-exact match",
548
+ short_display_name="EM",
549
+ description="Fraction of instances that the predicted output matches a correct reference up to light "
550
+ "processing.",
551
+ lower_is_better=False,
552
+ group=None,
553
+ ),
554
+ "quasi_leave_articles_exact_match": MetricMetadata(
555
+ name="quasi_leave_articles_exact_match",
556
+ display_name="Quasi-exact match",
557
+ short_display_name="EM",
558
+ description="Fraction of instances that the predicted output matches a correct reference up to light "
559
+ "processing.",
560
+ lower_is_better=False,
561
+ group=None,
562
+ ),
563
+ "prefix_exact_match": MetricMetadata(
564
+ name="prefix_exact_match",
565
+ display_name="Prefix exact match",
566
+ short_display_name="PEM",
567
+ description="Fraction of instances that the predicted output matches the prefix of a correct reference "
568
+ "exactly.",
569
+ lower_is_better=False,
570
+ group=None,
571
+ ),
572
+ "quasi_prefix_exact_match": MetricMetadata(
573
+ name="quasi_prefix_exact_match",
574
+ display_name="Prefix quasi-exact match",
575
+ short_display_name="PEM",
576
+ description="Fraction of instances that the predicted output matches the prefix of a correct reference "
577
+ "up to light processing.",
578
+ lower_is_better=False,
579
+ group=None,
580
+ ),
581
+ "exact_match_indicator": MetricMetadata(
582
+ name="exact_match_indicator",
583
+ display_name="Exact match (final)",
584
+ short_display_name="EM",
585
+ description="Fraction of instances that the predicted output matches a correct reference exactly, "
586
+ "ignoring text preceding the specified indicator (e.g., space).",
587
+ lower_is_better=False,
588
+ group=None,
589
+ ),
590
+ "final_number_exact_match": MetricMetadata(
591
+ name="final_number_exact_match",
592
+ display_name="Exact match (final number)",
593
+ short_display_name="EM",
594
+ description="Fraction of instances that the predicted output matches a correct reference exactly, "
595
+ "ignoring text preceding the specified indicator.",
596
+ lower_is_better=False,
597
+ group=None,
598
+ ),
599
+ "exact_set_match": MetricMetadata(
600
+ name="exact_set_match",
601
+ display_name="Exact match (at sets)",
602
+ short_display_name="EM",
603
+ description="Fraction of instances that the predicted output matches a correct reference exactly as " "sets.",
604
+ lower_is_better=False,
605
+ group=None,
606
+ ),
607
+ "iou_set_match": MetricMetadata(
608
+ name="iou_set_match",
609
+ display_name="Intersection over union (as sets)",
610
+ short_display_name="IoU",
611
+ description="Intersection over union in terms of set overlap between the model predicted set and "
612
+ "correct reference set.",
613
+ lower_is_better=False,
614
+ group=None,
615
+ ),
616
+ "f1_set_match": MetricMetadata(
617
+ name="f1_set_match",
618
+ display_name="F1 (set match)",
619
+ short_display_name="F1",
620
+ description="Average F1 score in terms of set overlap between the model predicted set and correct "
621
+ "reference set.",
622
+ lower_is_better=False,
623
+ group=None,
624
+ ),
625
+ "math_equiv": MetricMetadata(
626
+ name="math_equiv",
627
+ display_name="Equivalent",
628
+ description="Fraction of model outputs that are mathematically equivalent to the correct reference.",
629
+ lower_is_better=False,
630
+ group=None,
631
+ ),
632
+ "math_equiv_chain_of_thought": MetricMetadata(
633
+ name="math_equiv_chain_of_thought",
634
+ display_name="Equivalent (CoT)",
635
+ description="Fraction of model outputs that are mathematically equivalent to the correct reference "
636
+ "when using chain-of-thought prompting.",
637
+ lower_is_better=False,
638
+ group=None,
639
+ ),
640
+ "code_eval_acc": MetricMetadata(
641
+ name="code_eval_acc",
642
+ display_name="Correctness",
643
+ short_display_name="Correctness",
644
+ description="Fraction of instances that the model output evaluates to the correct answer.",
645
+ lower_is_better=False,
646
+ group=None,
647
+ ),
648
+ "pass": MetricMetadata(
649
+ name="pass",
650
+ display_name="pass@1",
651
+ description="Fraction of model outputs that pass the associated test cases.",
652
+ lower_is_better=False,
653
+ group=None,
654
+ ),
655
+ "cider": MetricMetadata(
656
+ name="cider",
657
+ display_name="CIDEr",
658
+ description="Evaluates the quality of generated caption by measuring the weighted similarity of "
659
+ "n-grams between the captions and a set of human-written reference captions, emphasizing "
660
+ "informativeness and consensus.",
661
+ lower_is_better=False,
662
+ group=None,
663
+ ),
664
+ "f1_score": MetricMetadata(
665
+ name="f1_score",
666
+ display_name="F1",
667
+ description="Average F1 score in terms of word overlap between the model output and correct reference.",
668
+ lower_is_better=False,
669
+ group=None,
670
+ ),
671
+ "rouge_1": MetricMetadata(
672
+ name="rouge_1",
673
+ display_name="ROUGE-1",
674
+ description="Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram " "overlap.",
675
+ lower_is_better=False,
676
+ group=None,
677
+ ),
678
+ "rouge_2": MetricMetadata(
679
+ name="rouge_2",
680
+ display_name="ROUGE-2",
681
+ description="Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram " "overlap.",
682
+ lower_is_better=False,
683
+ group=None,
684
+ ),
685
+ "rouge_l": MetricMetadata(
686
+ name="rouge_l",
687
+ display_name="ROUGE-L",
688
+ description="Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest "
689
+ "common subsequence overlap.",
690
+ lower_is_better=False,
691
+ group=None,
692
+ ),
693
+ "bleu_1": MetricMetadata(
694
+ name="bleu_1",
695
+ display_name="BLEU-1",
696
+ description="Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on "
697
+ "1-gram overlap.",
698
+ lower_is_better=False,
699
+ group=None,
700
+ ),
701
+ "bleu_4": MetricMetadata(
702
+ name="bleu_4",
703
+ display_name="BLEU-4",
704
+ description="Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on "
705
+ "4-gram overlap.",
706
+ lower_is_better=False,
707
+ group=None,
708
+ ),
709
+ "chinese_bleu_1": MetricMetadata(
710
+ name="chinese_bleu_1",
711
+ display_name="Chinese BLEU-1 score",
712
+ short_display_name="BLEU-1 (Chinese)",
713
+ description="BLEU-1 score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on a "
714
+ "Chinese tokenizer that segments Chinese strings by character.",
715
+ lower_is_better=False,
716
+ group=None,
717
+ # Group could be one of:
718
+ # "cleva_pinyin_transliteration_metrics"
719
+ # "cleva_dialogue_generation_metrics"
720
+ # "cleva_data_to_text_generation_metrics"
721
+ ),
722
+ "chinese_rouge_1": MetricMetadata(
723
+ name="chinese_rouge_1",
724
+ display_name="Chinese ROUGE-1 score",
725
+ short_display_name="ROUGE-1 (Chinese)",
726
+ description="ROUGE-1 score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on a Chinese "
727
+ "tokenizer that segments Chinese strings by character.",
728
+ lower_is_better=False,
729
+ group="cleva_summarization_metrics",
730
+ ),
731
+ "chinese_rouge_2": MetricMetadata(
732
+ name="chinese_rouge_2",
733
+ display_name="Chinese ROUGE-2 score",
734
+ short_display_name="ROUGE-2 (Chinese)",
735
+ description="ROUGE-2 score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on a Chinese "
736
+ "tokenizer that segments Chinese strings by character.",
737
+ lower_is_better=False,
738
+ group="cleva_summarization_metrics",
739
+ ),
740
+ "cleva_math_result_match": MetricMetadata(
741
+ name="cleva_math_result_match",
742
+ display_name="CLEVA Math Exact Match",
743
+ short_display_name="EM (Math)",
744
+ description="Exact match that cares only the last math expression (numbers and fractions) in the "
745
+ "model's prediction.",
746
+ lower_is_better=False,
747
+ group="cleva_mathematical_reasoning_metrics",
748
+ ),
749
+ "absolute_value_difference": MetricMetadata(
750
+ name="absolute_value_difference",
751
+ display_name="Absolute difference",
752
+ short_display_name="Diff.",
753
+ description="Average absolute difference between the model output (converted to a number) and the "
754
+ "correct reference.",
755
+ lower_is_better=True,
756
+ group=None,
757
+ ),
758
+ "wer_score": MetricMetadata(
759
+ name="wer_score",
760
+ display_name="Word Error Rate",
761
+ short_display_name="WER",
762
+ description="Word error rate between model predictions and ground truth answers for ASR tasks.",
763
+ lower_is_better=True,
764
+ group=None,
765
+ ),
766
+ "mer_score": MetricMetadata(
767
+ name="mer_score",
768
+ display_name="Match Error Rate",
769
+ short_display_name="MER",
770
+ description="Word match error rate between model predictions and ground truth answers.",
771
+ lower_is_better=True,
772
+ group=None,
773
+ ),
774
+ "wip_score": MetricMetadata(
775
+ name="wip_score",
776
+ display_name="Word Information Preservation",
777
+ short_display_name="WIP",
778
+ description="Word information preservation (WIP) for evaluating the preserved information of ASR.",
779
+ lower_is_better=False,
780
+ group=None,
781
+ ),
782
+ "cer_score": MetricMetadata(
783
+ name="cer_score",
784
+ display_name="Character Error Rate",
785
+ short_display_name="CER",
786
+ description="Character error rate (CER) for evaluating the accuracy of ASR.",
787
+ lower_is_better=True,
788
+ group=None,
789
+ ),
790
+ "chinese_wer_score": MetricMetadata(
791
+ name="chinese_wer_score",
792
+ display_name="Chinese Word Error Rate",
793
+ short_display_name="Chinese WER",
794
+ description="Chinese word error rate between model predictions and ground truth answers for ASR tasks.",
795
+ lower_is_better=True,
796
+ group=None,
797
+ ),
798
+ "chinese_mer_score": MetricMetadata(
799
+ name="chinese_mer_score",
800
+ display_name="Chinese Match Error Rate",
801
+ short_display_name="Chinese MER",
802
+ description="Chinese word match error rate between model predictions and ground truth answers.",
803
+ lower_is_better=True,
804
+ group=None,
805
+ ),
806
+ "chinese_wip_score": MetricMetadata(
807
+ name="chinese_wip_score",
808
+ display_name="Chinese Word Information Preservation",
809
+ short_display_name="Chinese WIP",
810
+ description="Chinese word information preservation (WIP) for evaluating the preserved information of " "ASR.",
811
+ lower_is_better=False,
812
+ group=None,
813
+ ),
814
+ "chinese_cer_score": MetricMetadata(
815
+ name="chinese_cer_score",
816
+ display_name="Chinese Character Error Rate",
817
+ short_display_name="Chinese CER",
818
+ description="Chinese character error rate (CER) for evaluating the accuracy of Chiese ASR.",
819
+ lower_is_better=True,
820
+ group=None,
821
+ ),
822
+ }
823
+
824
+
825
+ def get_reference_metrics_metadata(names: List[str]) -> List[MetricMetadata]:
826
+ metadata_list: List[MetricMetadata] = []
827
+ for name in names:
828
+ metadata = _METRIC_METADATA_MAPPING.get(name)
829
+ if metadata:
830
+ metadata_list.append(metadata)
831
+ return metadata_list