crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of crfm-helm has been flagged as potentially problematic.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0

helm/benchmark/metrics/bias_metrics.py

@@ -8,9 +8,18 @@ from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
 
 from helm.common.request import RequestResult, GeneratedOutput
 from helm.benchmark.adaptation.request_state import RequestState
-from .statistic import Stat
-from .metric_name import MetricName
-from .bias_word_lists import GENDER_TO_WORD_LISTS, RACE_TO_NAME_LISTS, ADJECTIVE_LIST, PROFESSION_LIST
+from helm.benchmark.metrics.nltk_helper import install_nltk_resources
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.bias_word_lists import (
+    GENDER_TO_WORD_LISTS,
+    RACE_TO_NAME_LISTS,
+    ADJECTIVE_LIST,
+    PROFESSION_LIST,
+)
+
+
+install_nltk_resources()
 
 
 class BiasMetric(EvaluateInstancesMetric):

helm/benchmark/metrics/bigcodebench_metrics.py

@@ -0,0 +1,25 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class BigCodeBenchMetric(Metric):
+    """Score metrics for BigCodeBench."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        score = request_state.annotations["bigcodebench"]["pass_at_one"]
+        return [
+            Stat(MetricName("bigcodebench_p@1")).add(score),
+        ]

helm/benchmark/metrics/bird_sql_metrics.py

@@ -0,0 +1,28 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class BirdSQLMetric(Metric):
+    """Score metrics for Bird-SQL."""
+
+    ANNOTATOR_NAME = "bird_sql"
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        if not request_state.annotations:
+            raise Exception("Request state did not have annotations.")
+        predicted_result = request_state.annotations[self.ANNOTATOR_NAME]["predicted_result"]
+        ground_truth_result = request_state.annotations[self.ANNOTATOR_NAME]["ground_truth_result"]
+        execution_accuracy = int(set(predicted_result) == set(ground_truth_result))
+        return [Stat(MetricName("execution_accuracy")).add(execution_accuracy)]
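
To illustrate the execution-accuracy check above, a small sketch with invented query results; the set comparison ignores row order but counts any missing or extra row as a failure:

    # Hypothetical results attached by the "bird_sql" annotator after executing both queries.
    predicted_result = [("Alice", 30), ("Bob", 25)]
    ground_truth_result = [("Bob", 25), ("Alice", 30)]

    # Same rows in a different order still score 1 (correct).
    assert int(set(predicted_result) == set(ground_truth_result)) == 1

    # A missing row scores 0 (incorrect).
    assert int(set([("Alice", 30)]) == set(ground_truth_result)) == 0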

helm/benchmark/metrics/chw_care_plan_metrics.py

@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.chw_care_plan_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class CHWCarePlanMetric(Metric):
+    """Score metrics for CHWCarePlan."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["chw_care_plan"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("chw_care_plan_accuracy")).add(score),
+        ]

helm/benchmark/metrics/classification_metrics.py

@@ -1,6 +1,6 @@
 from typing import List, Optional
 
-from sklearn.metrics import f1_score
+from sklearn.metrics import f1_score, precision_score, recall_score
 from sklearn.preprocessing import MultiLabelBinarizer
 
 from helm.benchmark.adaptation.request_state import RequestState
@@ -9,9 +9,14 @@ from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
 from helm.benchmark.metrics.metric import MetricName
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.scenario import Reference
+from helm.common.hierarchical_logger import hlog
 from helm.common.request import GeneratedOutput
 
 
+def _normalize_label_text(text: str) -> str:
+    return normalize_text(text, should_remove_articles=False)
+
+
 class ClassificationMetric(EvaluateInstancesMetric):
     """Defines metrics for multi-class classification using the generation adapter.
 
@@ -23,8 +28,12 @@ class ClassificationMetric(EvaluateInstancesMetric):
     reference. The predicted class for each instance is the normalized text of the generation.
 
     Note:
-    - The set of classes is derived from the correct references from all the instances.
-      This means that classes may be omitted if they are never used as a correct reference.
+    - It is highly recommended to specify the set of classes using the
+      `labels` parameter. Otherwise, the set of classes is derived from the correct references
+      from all the instances. This means that classes may be incorrectly omitted if they are never
+      used as a correct reference.
+    - The `averages` parameter is a list of averaging methods to be used.
+      It has the same meaning as `average` in scikit-learn.
     - Generations that are not in any of the known classes are counted as a
       negative prediction for every class.
     - Perturbed classes are considered different classes from unperturbed
@@ -32,8 +41,45 @@
     - Currently, multi-label classification is not supported.
     """
 
-    def __init__(self, delimiter: Optional[str] = None):
+    AVERAGE_OPTIONS = ["micro", "macro", "weighted", None]
+    SCORE_OPTIONS = ["f1", "precision", "recall"]
+
+    def __init__(
+        self,
+        averages: Optional[List[Optional[str]]] = None,
+        labels: Optional[List[str]] = None,
+        scores: Optional[List[str]] = None,
+        delimiter: Optional[str] = None,
+    ) -> None:
+        """Creates metrics for multi-class classification.
+
+        :param delimiter: For multi-label classification, the string delimiter between classes in the model's output.
+        :param scores: The list of scores to compute (e.g. "f1", "precision", "recall").
+            Defaults to ["f1"].
+        :param averages: The averaging methods (e.g. "micro", "macro", "weighted") to be used.
+            They have the same meaning as `average` in scikit-learn.
+            Defaults to ["macro", "micro"].
+        :param labels: The set of labels.
+        :return: A list of `Stat` objects.
+        """
+        self.averages = averages or ["macro", "micro"]
+        for average in self.averages:
+            if average not in ClassificationMetric.AVERAGE_OPTIONS:
+                raise ValueError(
+                    f"Each value in `averages` must be set to one of {ClassificationMetric.AVERAGE_OPTIONS}."
+                )
+        self.scores = scores or ["f1"]
+        for score_name in self.scores:
+            if score_name not in ClassificationMetric.SCORE_OPTIONS:
+                raise ValueError(f"Each value in `scores` must be set to one of {ClassificationMetric.SCORE_OPTIONS}.")
         self.delimiter = delimiter
+        self.labels = labels
+        if not self.labels:
+            hlog(
+                "WARNING: `labels` were not set on `ClassificationMetric`, "
+                "so they will be inferred from target references. "
+                "It is recommended to explicitly set `labels` on `ClassificationMetric`."
+            )
 
     def is_multi_label(self) -> bool:
         return bool(self.delimiter)
@@ -57,20 +103,38 @@ class ClassificationMetric(EvaluateInstancesMetric):
             references = request_state.instance.all_correct_references
             if not self.is_multi_label():
                 assert len(references) == 1
-            correct_ref_texts = [normalize_text(ref.output.text) for ref in references if ref.output.text]
+            correct_ref_texts = [_normalize_label_text(ref.output.text) for ref in references if ref.output.text]
             y_true.append(correct_ref_texts)
 
             input_text = request_state.result.completions[0].text
             predictions = input_text.split(self.delimiter) if self.is_multi_label() else [input_text]
-            y_pred.append([normalize_text(pred) for pred in predictions if pred])
-        labels: List[str] = list(set(y for ys in y_true for y in ys))
-        mlb = MultiLabelBinarizer().fit([labels])
+            y_pred.append([_normalize_label_text(pred) for pred in predictions if pred])
+        mlb = MultiLabelBinarizer().fit(
+            [[_normalize_label_text(label) for label in self.labels]] if self.labels else y_true
+        )
         y_true = mlb.transform(y_true)
         y_pred = mlb.transform(y_pred)
-        return [
-            Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
-            Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
-        ]
+        stats: List[Stat] = []
+        for average in self.averages:
+            for score_name in self.scores:
+                if score_name == "f1":
+                    score_value = f1_score(y_pred=y_pred, y_true=y_true, average=average)
+                elif score_name == "precision":
+                    score_value = precision_score(y_pred=y_pred, y_true=y_true, average=average)
+                elif score_name == "recall":
+                    score_value = recall_score(y_pred=y_pred, y_true=y_true, average=average)
+                else:
+                    raise ValueError(
+                        f"Unknown score name: '{score_name}' - expected one of ['f1', 'precision', 'recall']"
+                    )
+                if average is None:
+                    for mlb_class, class_score_value in zip(mlb.classes_, score_value):
+                        stats.append(
+                            Stat(MetricName(f"classification_{mlb_class}_{score_name}")).add(class_score_value)
+                        )
+                else:
+                    stats.append(Stat(MetricName(f"classification_{average}_{score_name}")).add(score_value))
+        return stats
 
 
 class MultipleChoiceClassificationMetric(EvaluateInstancesMetric):
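
To make the new parameters concrete, here is a small illustrative sketch (label names invented) of how the extended metric can now be configured; including None in `averages` yields one per-class stat such as classification_positive_f1 alongside the averaged scores:

    from helm.benchmark.metrics.classification_metrics import ClassificationMetric

    # Explicit labels guard against classes that never appear as a correct reference;
    # `averages` and `scores` mirror scikit-learn's `average` argument and score functions.
    metric = ClassificationMetric(
        labels=["positive", "negative", "neutral"],
        averages=["macro", "micro", None],  # None -> per-class stats
        scores=["f1", "precision", "recall"],
    )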

helm/benchmark/metrics/cleva_harms_metrics.py

@@ -14,12 +14,13 @@ from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
 from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.common.optional_dependencies import handle_module_not_found_error
-from .bias_metrics import BiasMetric
-from .toxicity_metrics import ToxicityMetric
-from .copyright_metrics import BasicCopyrightMetric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.bias_metrics import BiasMetric
+from helm.benchmark.metrics.toxicity_metrics import ToxicityMetric
+from helm.benchmark.metrics.copyright_metrics import BasicCopyrightMetric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.runner import get_benchmark_output_path
 
 try:
     import jieba
@@ -71,7 +72,7 @@ class CLEVABiasMetric(BiasMetric):
                 "demographic_category". One of "adjective" or "profession".
         """
         # Ensure all necessary data are downloaded
-        self.output_dir = os.path.join("benchmark_output", "metrics", self.name)
+        self.output_dir = os.path.join(get_benchmark_output_path(), "metrics", self.name)
         ensure_directory_exists(self.output_dir)
         for filename in self.FILE_NAMES:
             target_path = os.path.join(self.output_dir, filename)

helm/benchmark/metrics/code_metrics.py

@@ -10,11 +10,11 @@ from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.scenarios.code_scenario import CodeReference
-from . import code_metrics_helper
-from .metric import Metric, MetricResult
-from .metric_service import MetricService
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics import code_metrics_helper
+from helm.benchmark.metrics.metric import Metric, MetricResult
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
 
 MAXIMUM_MEMORY_BYTES = 8 * 1024 * 1024 * 1024  # 8GB.
 
@@ -0,0 +1,125 @@
1
+ import logging
2
+ from typing import List
3
+
4
+ import comet
5
+ from torch import nn
6
+
7
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
8
+ from helm.benchmark.adaptation.request_state import RequestState
9
+ from helm.benchmark.adaptation.scenario_state import ScenarioState
10
+ from helm.benchmark.metrics.metric import Metric, MetricResult
11
+ from helm.benchmark.metrics.metric_name import MetricName
12
+ from helm.benchmark.metrics.metric_service import MetricService
13
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.common.hierarchical_logger import hlog
+ from helm.common.request import RequestResult
+
+
+ class CometMetric(Metric):
+ """COMET machine translation metric using a regression model.
+ The model takes a triplet of source sentence, translation, and reference
+ and computes a score in the range [0, 1] reflecting the quality of the predicted
+ translation.
+
+ Paper:
+ @inproceedings{rei-etal-2022-comet,
+ title = "{COMET}-22: Unbabel-{IST} 2022 Submission for the Metrics Shared Task",
+ author = "Rei, Ricardo and
+ C. de Souza, Jos{\'e} G. and
+ Alves, Duarte and
+ Zerva, Chrysoula and
+ Farinha, Ana C and
+ Glushkova, Taisiya and
+ Lavie, Alon and
+ Coheur, Luisa and
+ Martins, Andr{\'e} F. T.",
+ editor = {Koehn, Philipp and
+ Barrault, Lo{\"\i}c and
+ Bojar, Ond{\v{r}}ej and
+ Bougares, Fethi and
+ Chatterjee, Rajen and
+ Costa-juss{\`a}, Marta R. and
+ Federmann, Christian and
+ Fishel, Mark and
+ Fraser, Alexander and
+ Freitag, Markus and
+ Graham, Yvette and
+ Grundkiewicz, Roman and
+ Guzman, Paco and
+ Haddow, Barry and
+ Huck, Matthias and
+ Jimeno Yepes, Antonio and
+ Kocmi, Tom and
+ Martins, Andr{\'e} and
+ Morishita, Makoto and
+ Monz, Christof and
+ Nagata, Masaaki and
+ Nakazawa, Toshiaki and
+ Negri, Matteo and
+ N{\'e}v{\'e}ol, Aur{\'e}lie and
+ Neves, Mariana and
+ Popel, Martin and
+ Turchi, Marco and
+ Zampieri, Marcos},
+ booktitle = "Proceedings of the Seventh Conference on Machine Translation (WMT)",
+ month = dec,
+ year = "2022",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2022.wmt-1.52",
+ }
+ """
+
+ METRIC_NAME = "comet"
+
+ def __init__(self, task: str, model_name: str = "Unbabel/wmt22-comet-da", device: str = "cpu"):
+ self.model_name = model_name
+ self.comet_scorer: nn.Module = self._load_model(model_name)
+ self.num_gpus = 0 if device == "cpu" else 1
+
+ # suppress warnings from PyTorch Lightning, which spam the terminal
+ logging.getLogger("lightning.pytorch.utilities.rank_zero").setLevel(logging.WARNING)
+ logging.getLogger("lightning.pytorch.accelerators.cuda").setLevel(logging.WARNING)
+
+ @staticmethod
+ def _load_model(model_name: str) -> nn.Module:
+ """Load the COMET model from the checkpoint.
+
+ Returns:
+ The loaded model.
+ """
+ return comet.load_from_checkpoint(comet.download_model(model_name))
+
+ def evaluate(
+ self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+ ) -> MetricResult:
+ hlog(
+ f"Setting parallelism from {parallelism} to 1, since "
+ f"evaluating {self.__class__.__name__} with parallelism > 1 seg faults."
+ )
+ return super().evaluate(scenario_state, metric_service, eval_cache_path, parallelism=1)
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+ """Compute the COMET score for this instance."""
+ assert len(request_state.instance.references) == 1
+ ref = request_state.instance.references[0].output.text
+ src = request_state.instance.input.text
+
+ result = request_state.result
+ if not isinstance(result, RequestResult):
+ raise TypeError(f"Expected a valid result, but got {result}!")
+ mt = result.completions[0].text.strip()
+
+ # comet requires this exact format
+ data = [dict(ref=ref, src=src, mt=mt)]
+ output = self.comet_scorer.predict(data, gpus=self.num_gpus, progress_bar=False)  # type: ignore
+ comet_score = output[0][0]  # extract the actual score
+
+ metric_result = [Stat(MetricName(self.METRIC_NAME)).add(comet_score)]
+
+ return metric_result
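For orientation, here is a minimal sketch (editorial, not part of the diff) of what the new metric does under the hood, reusing the same `comet` calls as `_load_model` and `evaluate_generation` above; the model name matches the class default, while the example sentences are made up:

```python
# Sketch only: score a single (source, translation, reference) triplet on CPU.
import comet

model = comet.load_from_checkpoint(comet.download_model("Unbabel/wmt22-comet-da"))
data = [dict(src="Der Hund bellt.", mt="The dog barks.", ref="The dog is barking.")]
output = model.predict(data, gpus=0, progress_bar=False)
print(output[0][0])  # segment-level COMET score, roughly in [0, 1]
```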
@@ -44,11 +44,18 @@ def get_language_modeling_metric_specs(names: List[str]) -> List[MetricSpec]:
  ]
 
 
- def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
+ def get_classification_metric_specs(
+ labels: Optional[List[str]] = None, delimiter: Optional[str] = None
+ ) -> List[MetricSpec]:
+ extra_args: Dict[str, Any] = {}
+ if labels:
+ extra_args["labels"] = labels
+ if delimiter:
+ extra_args["delimiter"] = delimiter
  return [
  MetricSpec(
  class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
- args={"delimiter": delimiter},
+ args=extra_args,
  )
  ]
 
@@ -0,0 +1,72 @@
+ import re
+ from typing import Any, List
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG
+ from helm.common.hierarchical_logger import hlog
+
+
+ def _strip_string(str: str) -> Any:
+ # from https://stackoverflow.com/a/4703508
+ numeric_const_pattern = r"[-+]?(?:(?:\d*\.\d+)|(?:\d+\.?))(?:[Ee][+-]?\d+)?"
+ match = re.search(numeric_const_pattern, str)
+ if match:
+ try:
+ return float(str[match.start() : match.end()])
+ except Exception:
+ return None
+ return None
+
+
+ def float_equiv(str1: str, str2: str, eps: float = 1e-6) -> float:
+ """Check if two values have the same float value, up to a small tolerance.
+
+ This is the implementation used in the IBM Enterprise Benchmark paper.
+
+ Note: This is a "mostly-correct" equality function and does not handle some cases correctly:
+
+ - If either value is a non-float, it will always return 0.0,
+ regardless of whether the strings match.
+ - If the two values have different units (e.g. currency symbols,
+ trailing "M" or "B", trailing %), the values will not be converted to the same
+ units before comparison.
+ """
+ try:
+ ss1 = _strip_string(str1)
+ ss2 = _strip_string(str2)
+
+ if ss1 is None or ss2 is None:
+ hlog("WARNING: float_equiv returning 0.0 because at least one of the values is a non-float")
+ return 0.0
+ return float(abs(ss1 - ss2) < eps)
+ except Exception:
+ return float(str1 == str2)
+
+
+ class ConvFinQACalcMetric(Metric):
+ """Score metrics for ConvFinQA Calc."""
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+ assert request_state.result
+ assert len(request_state.result.completions) == 1
+ model_answer = request_state.result.completions[0].text
+
+ assert len(request_state.instance.references) == 1
+ assert len(request_state.instance.references[0].tags) == 1
+ assert request_state.instance.references[0].tags[0] == CORRECT_TAG
+ gold_answer = request_state.instance.references[0].output.text
+
+ return [
+ Stat(MetricName("float_equiv")).add(float_equiv(model_answer, gold_answer)),
+ ]
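To make the comparison behavior concrete, a few illustrative calls (editorial examples, not part of the diff), assuming `float_equiv` as defined above:

```python
float_equiv("The answer is 21.5%", "21.50")  # 1.0 - both sides parse to 21.5
float_equiv("0.305", "0.31")                 # 0.0 - difference exceeds eps
float_equiv("n/a", "none")                   # 0.0 - neither side parses to a float
```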
@@ -9,10 +9,10 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
  from helm.benchmark.scenarios.scenario import Reference
  from helm.common.optional_dependencies import handle_module_not_found_error
  from helm.common.request import RequestResult
- from .metric import Metric
- from .metric_name import MetricName
- from .metric_service import MetricService
- from .statistic import Stat
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
 
  try:
  import numba
@@ -0,0 +1,29 @@
+ from typing import List
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+
+
+ class CzechBankQAMetrics(Metric):
+ """Score metrics for Czech Bank QA."""
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+ # assert len(request_state.instance.references) > 1
+ # category_text = request_state.instance.references[0].output.text
+ # category_parts = category_text.split(".")
+ # assert len(category_parts) == 3
+ assert request_state.annotations
+ error_rate = 0.0 if request_state.annotations["czech_bank_qa"]["error"] is None else 1.0
+ return [
+ Stat(MetricName("error_rate")).add(error_rate),
+ ]
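As a hedged illustration of the annotation shape this metric reads: only the `czech_bank_qa` and `error` keys come from the code above, and the error string is hypothetical.

```python
# A run with no execution error counts as success ...
annotations = {"czech_bank_qa": {"error": None}}             # -> error_rate = 0.0
# ... while any non-None error string from the annotator counts as a failure.
annotations = {"czech_bank_qa": {"error": "no such table"}}  # -> error_rate = 1.0
```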
@@ -1,8 +1,8 @@
  from typing import List
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
- from .metric_name import MetricName
- from .statistic import Stat
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.statistic import Stat
  from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
  import numpy as np
  from fairlearn.metrics import equalized_odds_difference
@@ -1,7 +1,7 @@
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
- from .metric_name import MetricName
- from .statistic import Stat
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.statistic import Stat
 
  from typing import List
  import re
 
@@ -3,8 +3,8 @@ from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMe
  from helm.common.request import GeneratedOutput
  from helm.common.request import RequestResult
  from helm.benchmark.adaptation.request_state import RequestState
- from .metric_name import MetricName
- from .statistic import Stat
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.statistic import Stat
 
  import numpy as np
 
@@ -0,0 +1,34 @@
+ from typing import Any, Dict, List
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.annotation.dischargeme_annotator import ANNOTATOR_MODELS
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+
+
+ class DischargeMeMetric(Metric):
+ """Score metrics for DischargeMe."""
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+ assert request_state.annotations
+ annotations: Dict[str, Any] = request_state.annotations["dischargeme"]
+ scores: List[int] = []
+ score = 0.0
+ for annotation_key, annotation_dict in annotations.items():
+ if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+ for val in annotation_dict.values():
+ scores.append(int(val["score"]))
+ if scores:
+ score = sum(scores) / len(scores)
+ return [
+ Stat(MetricName("dischargeme_accuracy")).add(score),
+ ]
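For clarity, a hedged sketch of the aggregation performed above; the annotator keys and rubric names are hypothetical, and only the nested `{"score": ...}` structure and the averaging come from the code:

```python
annotations = {
    "dischargeme": {
        "gpt": {"coherence": {"score": 4}, "faithfulness": {"score": 5}},  # hypothetical annotator key
        "llama": {"coherence": {"score": 3}},                              # hypothetical annotator key
        "prompt_text": None,                                               # skipped: not a key of ANNOTATOR_MODELS
    }
}
# dischargeme_accuracy = (4 + 5 + 3) / 3 = 4.0
```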
@@ -11,10 +11,10 @@ from helm.common.optional_dependencies import handle_module_not_found_error
  from helm.common.request import RequestResult, GeneratedOutput
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
- from .metric import Metric
- from .metric_name import MetricName
- from .metric_service import MetricService
- from .statistic import Stat
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
 
  try:
  from sacrebleu.metrics import BLEU
@@ -8,11 +8,11 @@ from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.metrics.statistic import Stat, merge_stat
  from helm.benchmark.window_services.window_service import WindowService
  from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
- from .metric import MetricInterface, MetricResult, PerInstanceStats
- from .metric_name import MetricName
- from .metric_service import MetricService
- from .tokens.auto_token_cost_estimator import AutoTokenCostEstimator
- from .tokens.token_cost_estimator import TokenCostEstimator
+ from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.tokens.auto_token_cost_estimator import AutoTokenCostEstimator
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
  @dataclass