crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,190 @@
1
+ from threading import Lock
2
+ import librosa
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from dataclasses import dataclass
6
+ from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
7
+
8
+ from helm.common.cache import CacheConfig
9
+ from helm.common.gpu_utils import get_torch_device_name
10
+ from helm.common.hierarchical_logger import hlog, htrack_block
11
+ from helm.common.media_object import TEXT_TYPE
12
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
13
+ from helm.common.request import wrap_request_time
14
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
15
+
16
+
17
@dataclass(frozen=True)
class LoadedQwenModelProcessor:
    """Immutable pairing of a loaded Qwen2-Audio model with its processor."""

    # The conditional-generation model that produces completions.
    model: Qwen2AudioForConditionalGeneration
    # Despite the field name, this is annotated as (and loaded via) `AutoProcessor`.
    tokenizer: AutoProcessor
23
+
24
+
25
# Held while `_models` is read or populated so only one thread loads a model at a time.
_models_lock: Lock = Lock()
# In-memory cache keyed by Hugging Face model name; an entry stays None until first loaded.
_models: Dict[str, Optional[LoadedQwenModelProcessor]] = {"Qwen/Qwen2-Audio-7B-Instruct": None}
29
+
30
+
31
class Qwen2AudioLMClient(CachingClient):
    """
    Client that runs Qwen2-Audio-Instruct in-process via Hugging Face `transformers`.

    From https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct:
    Qwen2-Audio-Instruct is the audio multimodal version of the Qwen2 (abbr. Tongyi Qianwen)
    large model series proposed by Alibaba Cloud. It accepts audio and text as inputs and
    outputs text. Only the instruction-following variant is integrated here for now.

    Paper: https://arxiv.org/abs/2407.10759
    """

    END_OF_TEXT_TOKEN: str = "<|im_end|>"

    def __init__(self, cache_config: CacheConfig):
        super().__init__(cache_config=cache_config)
        self._device: str = get_torch_device_name()

    def _get_model(self, helm_model_name: str) -> LoadedQwenModelProcessor:
        """Return the model/processor pair for `helm_model_name`, loading it on first use.

        Raises:
            ValueError: if `helm_model_name` is not a model this client handles.
        """
        if helm_model_name != "qwen2-audio-7b-instruct":
            raise ValueError(f"Unhandled model name: {helm_model_name}")
        model_name: str = "Qwen/Qwen2-Audio-7B-Instruct"

        # Ensure that only one thread is loading the model at a time
        with _models_lock:
            loaded = _models[model_name]
            if loaded is None:
                hlog(f"Loading model {model_name} and caching in memory...")
                model = Qwen2AudioForConditionalGeneration.from_pretrained(
                    model_name,
                    device_map=self._device,
                ).eval()
                processor = AutoProcessor.from_pretrained(model_name)
                loaded = LoadedQwenModelProcessor(model, processor)
                _models[model_name] = loaded

        assert loaded is not None
        return loaded

    def make_request(self, request: Request) -> RequestResult:
        """Generate `request.num_completions` completions for a multimodal (audio + text) prompt.

        Each completion is looked up in / stored to the cache individually. A `RuntimeError`
        raised while producing any completion yields an unsuccessful `RequestResult`.
        """
        assert request.multimodal_prompt is not None, "Multimodal prompt is required"

        loaded: LoadedQwenModelProcessor = self._get_model(request.model_engine)
        model = loaded.model
        processor = loaded.tokenizer

        # `user_content` is what the model actually receives; `prompt_parts` is only a
        # human-readable rendering of the prompt used for logging.
        user_content: List[Dict[str, str]] = []
        prompt_parts: List[str] = ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"]
        for media_num, media_object in enumerate(request.multimodal_prompt.media_objects):
            if media_object.is_type("audio") and media_object.location:
                assert media_object.is_local_file, "Only local audio files are supported"
                user_content.append({"type": "audio", "audio_url": media_object.location})
                prompt_parts.append(f"<|im_start|>user\nAudio {media_num+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n")
            elif media_object.is_type(TEXT_TYPE):
                if media_object.text is None:
                    raise ValueError("MediaObject of text type has missing text field value")
                user_content.append({"type": "text", "text": media_object.text})
                prompt_parts.append(media_object.text)
            else:
                raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
        prompt_parts.append("<|im_end|>\n<|im_start|>assistant\n")
        prompt_text: str = "".join(prompt_parts)

        conversation: List[Dict[str, Any]] = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_content},
        ]

        def do_it() -> Dict[str, Any]:
            """Run one uncached generation and return the completion text and its tokens."""
            chat_text = processor.apply_chat_template(  # type: ignore
                conversation, add_generation_prompt=True, tokenize=False
            )
            sampling_rate = processor.feature_extractor.sampling_rate  # type: ignore
            # Refer to the official Qwen2-Audio documentation for the format of the input query
            # https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct
            # Only the user message carries list-typed content, so audio is collected from it.
            audios: List[Any] = [
                librosa.load(element["audio_url"], sr=sampling_rate)[0]
                for element in user_content
                if element["type"] == "audio"
            ]
            inputs = processor(  # type: ignore
                text=chat_text,
                audios=audios,
                sampling_rate=sampling_rate,
                return_tensors="pt",
                padding=True,
            )
            input_length = inputs.input_ids.size(1)
            inputs = inputs.to(self._device)
            # Qwen2-Audio-Instruct counts input into the max_length,
            # so we need to add the length of the prompt
            generated = model.generate(**inputs, max_length=request.max_tokens + input_length)
            # Drop the echoed prompt tokens and keep only the newly generated ones.
            pred = generated[:, input_length:]

            completion = processor.decode(  # type: ignore
                pred.cpu()[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            # The processor of Qwen2-Audio-Instruct consists of an AutoTokenizer and a WhisperFeatureExtractor
            tokens: List[str] = processor.tokenizer.tokenize(completion)  # type: ignore
            return {"output": (completion, tokens)}

        completions: List[GeneratedOutput] = []
        request_time: float = 0
        request_datetime: Optional[int] = None
        all_cached: bool = True

        with htrack_block(f"Generating for prompt: {prompt_text}"):
            for completion_index in range(request.num_completions):
                try:
                    # Include the prompt and model name in the cache key
                    cache_key = CachingClient.make_cache_key(
                        raw_request={
                            "completion_index": completion_index,
                            "model": request.model,
                            "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
                            "max_tokens": request.max_tokens,
                        },
                        request=request,
                    )
                    result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
                except RuntimeError as model_error:
                    return RequestResult(
                        success=False, cached=False, error=str(model_error), completions=[], embedding=[]
                    )

                text, tokens = result["output"]
                hlog(f"Generated: {text}")

                # No log probabilities are computed here, so every logprob is reported as 0.
                generated_tokens = [Token(text=str(token), logprob=0) for token in tokens]
                completions.append(GeneratedOutput(text=text, logprob=0, tokens=generated_tokens))

                request_time += result["request_time"]
                # Use the datetime from the first completion because that's when the request was fired
                if not request_datetime:
                    request_datetime = result.get("request_datetime")
                all_cached = all_cached and cached

        return RequestResult(
            success=True,
            cached=all_cached,
            request_time=request_time,
            request_datetime=request_datetime,
            completions=completions,
            embedding=[],
        )
@@ -0,0 +1,152 @@
1
+ from threading import Lock
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from dataclasses import dataclass
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
+
7
+ from helm.common.cache import CacheConfig
8
+ from helm.common.gpu_utils import get_torch_device_name
9
+ from helm.common.hierarchical_logger import hlog, htrack_block
10
+ from helm.common.media_object import TEXT_TYPE
11
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
12
+ from helm.common.request import wrap_request_time
13
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
14
+
15
+
16
@dataclass(frozen=True)
class LoadedQwenModelProcessor:
    """Immutable pairing of a loaded Qwen-Audio model with its tokenizer."""

    # The causal language model that produces completions.
    model: AutoModelForCausalLM
    # The tokenizer loaded alongside the model.
    tokenizer: AutoTokenizer
22
+
23
+
24
# Held while `_models` is read or populated so only one thread loads a model at a time.
_models_lock: Lock = Lock()
# In-memory cache keyed by Hugging Face model name; an entry stays None until first loaded.
_models: Dict[str, Optional[LoadedQwenModelProcessor]] = {"Qwen/Qwen-Audio-Chat": None}
28
+
29
+
30
class QwenAudioLMClient(CachingClient):
    """
    Client that runs Qwen-Audio-Chat in-process via Hugging Face `transformers`.

    From https://huggingface.co/Qwen/Qwen-Audio:
    Qwen-Audio-Chat (Qwen Large Audio Language Model) is the audio multimodal version of the
    Qwen (abbr. Tongyi Qianwen) large model series proposed by Alibaba Cloud. It accepts audio
    and text as inputs and outputs text. Qwen-Audio-Chat is a chatbot model based on Qwen-Audio;
    it is the variant integrated here for instruction-following tasks.

    Paper: https://arxiv.org/abs/2311.07919
    """

    def __init__(self, cache_config: CacheConfig):
        super().__init__(cache_config=cache_config)
        self._device: str = get_torch_device_name()

    def _get_model(self, helm_model_name: str) -> LoadedQwenModelProcessor:
        """Return the model/tokenizer pair for `helm_model_name`, loading it on first use.

        Raises:
            ValueError: if `helm_model_name` is not a model this client handles.
        """
        if helm_model_name != "qwen-audio-chat":
            raise ValueError(f"Unhandled model name: {helm_model_name}")
        model_name: str = "Qwen/Qwen-Audio-Chat"

        # Ensure that only one thread is loading the model at a time
        with _models_lock:
            loaded = _models[model_name]
            if loaded is None:
                hlog(f"Loading model {model_name} and caching in memory...")
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    device_map=self._device,
                    trust_remote_code=True,
                    bf16=True,
                ).eval()
                tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
                loaded = LoadedQwenModelProcessor(model, tokenizer)
                _models[model_name] = loaded

        assert loaded is not None
        return loaded

    def make_request(self, request: Request) -> RequestResult:
        """Generate `request.num_completions` completions for a multimodal (audio + text) prompt.

        Each completion is looked up in / stored to the cache individually. A `RuntimeError`
        raised while producing any completion yields an unsuccessful `RequestResult`.
        """
        assert request.multimodal_prompt is not None, "Multimodal prompt is required"

        loaded: LoadedQwenModelProcessor = self._get_model(request.model_engine)
        model = loaded.model
        tokenizer = loaded.tokenizer

        # NOTE(review): within this method these arguments only feed the cache key; they are
        # not passed to `model.chat` below. Confirm whether max_tokens is meant to be enforced.
        generation_args = {
            "max_length": request.max_tokens,
        }

        # `query` is what the model receives; `text_parts` only feeds the log message.
        query: List[Dict[str, str]] = []
        text_parts: List[str] = []
        for media_object in request.multimodal_prompt.media_objects:
            if media_object.is_type("audio") and media_object.location:
                query.append({"audio": media_object.location})
            elif media_object.is_type(TEXT_TYPE):
                if media_object.text is None:
                    raise ValueError("MediaObject of text type has missing text field value")
                query.append({"text": media_object.text})
                text_parts.append(media_object.text)
            else:
                raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
        prompt_text: str = "".join(text_parts)

        def do_it() -> Dict[str, Any]:
            """Run one uncached chat turn and return the completion text and its tokens."""
            formatted_query = tokenizer.from_list_format(query)  # type: ignore
            completion, _ = model.chat(tokenizer, query=formatted_query, history=None)  # type: ignore
            tokens: List[str] = tokenizer.tokenize(completion)  # type: ignore
            return {"output": (completion, tokens)}

        completions: List[GeneratedOutput] = []
        request_time: float = 0
        request_datetime: Optional[int] = None
        all_cached: bool = True

        with htrack_block(f"Generating for prompt: {prompt_text}"):
            for completion_index in range(request.num_completions):
                try:
                    # Include the prompt and model name in the cache key
                    cache_key = CachingClient.make_cache_key(
                        raw_request={
                            "completion_index": completion_index,
                            "model": request.model,
                            "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
                            **generation_args,
                        },
                        request=request,
                    )
                    result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
                except RuntimeError as model_error:
                    return RequestResult(
                        success=False, cached=False, error=str(model_error), completions=[], embedding=[]
                    )

                text, tokens = result["output"]

                # No log probabilities are computed here, so every logprob is reported as 0.
                generated_tokens = [Token(text=str(token), logprob=0) for token in tokens]
                completions.append(GeneratedOutput(text=text, logprob=0, tokens=generated_tokens))

                request_time += result["request_time"]
                # Use the datetime from the first completion because that's when the request was fired
                if not request_datetime:
                    request_datetime = result.get("request_datetime")
                all_cached = all_cached and cached

        return RequestResult(
            success=True,
            cached=all_cached,
            request_time=request_time,
            request_datetime=request_datetime,
            completions=completions,
            embedding=[],
        )
@@ -0,0 +1,62 @@
1
+ import soundfile as sf
2
+
3
+ from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor # type: ignore
4
+ from qwen_omni_utils import process_mm_info
5
+
6
# Stand-alone demo: run Qwen2.5-Omni on a sample video and save the generated speech.

# Load the model on the available device(s) with the default attention implementation.
# flash_attention_2 is recommended for better acceleration and memory saving; to use it,
# additionally pass attn_implementation="flash_attention_2" to from_pretrained().
model = Qwen2_5OmniModel.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")

processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

# A system message followed by a single user turn that contains only a video.
conversation = [
    {
        "role": "system",
        "content": (
            "You are Qwen, a virtual human developed by the Qwen Team,"
            " Alibaba Group, capable of perceiving auditory and visual"
            " inputs, as well as generating text and speech."
        ),
    },
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
        ],
    },
]

# Whether the audio track of the video is used as input as well.
USE_AUDIO_IN_VIDEO = True

# Preparation for inference: render the chat template and gather the media inputs.
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = processor(
    text=text,
    audios=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
# Move the inputs to the model's device first, then cast them to the model's dtype.
inputs = inputs.to(model.device)
inputs = inputs.to(model.dtype)

# Inference: generation of the output text and audio.
text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(text)

# Flatten the generated audio and write it out as a WAV file.
sf.write("output.wav", audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
@@ -71,6 +71,7 @@ class AutoClient(Client):
71
71
  model_deployment.client_spec,
72
72
  constant_bindings={
73
73
  "cache_config": cache_config,
74
+ "model_name": model_deployment.model_name,
74
75
  "tokenizer_name": model_deployment.tokenizer_name,
75
76
  },
76
77
  provider_bindings={
@@ -81,6 +82,7 @@ class AutoClient(Client):
81
82
  "org_id": lambda: self.credentials.get(
82
83
  host_organization + "OrgId", None
83
84
  ), # OpenAI, GooseAI, Microsoft
85
+ "base_url": lambda: self.credentials.get(host_organization + "BaseUrl", None),
84
86
  "moderation_api_client": lambda: self.get_moderation_api_client(), # OpenAI DALL-E
85
87
  "lock_file_path": lambda: os.path.join(
86
88
  self.file_storage_path, f"{host_organization}.lock"
@@ -131,7 +133,7 @@ class AutoClient(Client):
131
133
  return replace(last_attempt.value, error=f"{retry_error}. Error: {last_attempt.value.error}")
132
134
 
133
135
  def get_gcs_client(self):
134
- from .gcs_client import GCSClient
136
+ from helm.clients.gcs_client import GCSClient
135
137
 
136
138
  bucket_name: str = self.credentials["gcsBucketName"]
137
139
  cache_config: CacheConfig = self.cache_backend_config.get_cache_config("gcs")
@@ -144,7 +146,7 @@ class AutoClient(Client):
144
146
  return NudityCheckClient(cache_config)
145
147
 
146
148
  def get_clip_score_client(self):
147
- from .clip_score_client import CLIPScoreClient
149
+ from helm.clients.clip_score_client import CLIPScoreClient
148
150
 
149
151
  cache_config: CacheConfig = self.cache_backend_config.get_cache_config("clip_score")
150
152
  return CLIPScoreClient(cache_config)
@@ -0,0 +1,55 @@
1
+ import os
2
+ from typing import Dict, Optional
3
+
4
+ from helm.clients.openai_client import OpenAIClient
5
+ from helm.common.cache import CacheConfig
6
+ from helm.common.optional_dependencies import handle_module_not_found_error
7
+ from helm.proxy.retry import NonRetriableException
8
+ from helm.tokenizers.tokenizer import Tokenizer
9
+
10
+ try:
11
+ from openai import AzureOpenAI
12
+ except ModuleNotFoundError as e:
13
+ handle_module_not_found_error(e, ["openai"])
14
+
15
+
16
class AzureOpenAIClient(OpenAIClient):
    """`OpenAIClient` variant whose underlying client is an `openai.AzureOpenAI` instance."""

    # API version used when the caller does not supply `api_version`.
    API_VERSION = "2024-07-01-preview"

    def __init__(
        self,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        cache_config: CacheConfig,
        api_key: Optional[str] = None,
        endpoint: Optional[str] = None,
        api_version: Optional[str] = None,
        default_headers: Optional[Dict[str, str]] = None,
        base_url: Optional[str] = None,
        azure_openai_deployment_name: Optional[str] = None,
    ):
        """Create the client from either an Azure endpoint or a base URL.

        Args:
            tokenizer: Forwarded to `OpenAIClient`.
            tokenizer_name: Forwarded to `OpenAIClient`.
            cache_config: Forwarded to `OpenAIClient`.
            api_key: API key handed to `AzureOpenAI`.
            endpoint: Azure endpoint; falls back to the `AZURE_OPENAI_ENDPOINT`
                environment variable when not given.
            api_version: API version; defaults to `AzureOpenAIClient.API_VERSION`.
            default_headers: Extra headers handed to `AzureOpenAI`.
            base_url: Base URL handed to `AzureOpenAI` in place of an endpoint.
            azure_openai_deployment_name: Forwarded to `OpenAIClient` as
                `openai_model_name`.

        Raises:
            AssertionError: if both or neither of the endpoint and `base_url` are set.
            NonRetriableException: if neither value is usable (e.g. an empty string).
        """
        # The parent gets a placeholder key because `self.client` is replaced below with an
        # `AzureOpenAI` instance that carries the real credentials.
        super().__init__(
            tokenizer=tokenizer,
            tokenizer_name=tokenizer_name,
            cache_config=cache_config,
            api_key="unused",
            openai_model_name=azure_openai_deployment_name,
        )
        azure_endpoint = endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
        assert (base_url is None) != (azure_endpoint is None), "Exactly one of endpoint or base_url must be provided"
        if base_url:
            self.client = AzureOpenAI(
                api_key=api_key,
                api_version=api_version or AzureOpenAIClient.API_VERSION,
                base_url=base_url,
                default_headers=default_headers,
            )
        elif azure_endpoint:
            self.client = AzureOpenAI(
                api_key=api_key,
                api_version=api_version or AzureOpenAIClient.API_VERSION,
                azure_endpoint=azure_endpoint,
                default_headers=default_headers,
            )
        else:
            # Reached when the single provided value is falsy (e.g. ""), or when assertions
            # are disabled and neither value was given.
            raise NonRetriableException("Must provide either endpoint or base_url")