crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (580) hide show
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,175 @@
1
+ from threading import Lock
2
+ from typing import Any, Dict, List, Optional
3
+ from dataclasses import dataclass
4
+
5
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
6
+ from qwen_vl_utils import process_vision_info
7
+ import torch
8
+
9
+ from helm.common.cache import CacheConfig
10
+ from helm.common.gpu_utils import get_torch_device_name
11
+ from helm.common.hierarchical_logger import hlog, htrack_block
12
+ from helm.common.media_object import TEXT_TYPE
13
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
14
+ from helm.common.request import wrap_request_time
15
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
16
+
17
+
18
+ @dataclass(frozen=True)
19
+ class LoadedQwen2ModelProcessor:
20
+ model: Qwen2VLForConditionalGeneration
21
+ processor: AutoProcessor
22
+
23
+
24
+ _models_lock: Lock = Lock()
25
+ _models: Dict[str, Optional[LoadedQwen2ModelProcessor]] = {
26
+ "Qwen/Qwen2-VL-7B-Instruct": None,
27
+ "Qwen/Qwen2-VL-72B-Instruct": None,
28
+ }
29
+
30
+
31
+ class Qwen2VLMClient(CachingClient):
32
+ def __init__(self, cache_config: CacheConfig):
33
+ super().__init__(cache_config=cache_config)
34
+ self._device: str = get_torch_device_name()
35
+
36
+ def _get_model_name(self, helm_model_name: str) -> str:
37
+ if helm_model_name == "qwen2-vl-7b-instruct":
38
+ return "Qwen/Qwen2-VL-7B-Instruct"
39
+ elif helm_model_name == "qwen2-vl-72b-instruct":
40
+ return "Qwen/Qwen2-VL-72B-Instruct"
41
+ else:
42
+ raise ValueError(f"Unhandled model name: {helm_model_name}")
43
+
44
+ def _get_model(self, helm_model_name: str) -> LoadedQwen2ModelProcessor:
45
+ global _models_lock
46
+ global _models
47
+
48
+ model_name = self._get_model_name(helm_model_name)
49
+
50
+ with _models_lock:
51
+ loaded = _models[model_name]
52
+ if loaded is None:
53
+ hlog(f"Loading model {model_name} and caching in memory...")
54
+ # https://huggingface.co/docs/transformers/model_doc/qwen2_vl#flash-attention-2-to-speed-up-generation
55
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
56
+ model_name,
57
+ torch_dtype=torch.bfloat16,
58
+ device_map="auto",
59
+ attn_implementation="flash_attention_2",
60
+ ).eval()
61
+ processor = AutoProcessor.from_pretrained(model_name)
62
+ loaded = LoadedQwen2ModelProcessor(model=model, processor=processor)
63
+ _models[model_name] = loaded
64
+
65
+ return loaded
66
+
67
+ def make_request(self, request: Request) -> RequestResult:
68
+ assert request.multimodal_prompt is not None, "Multimodal prompt is required"
69
+ loaded = self._get_model(request.model_engine)
70
+ model = loaded.model
71
+ processor = loaded.processor
72
+
73
+ # Build Qwen2 messages
74
+ # We assume all media objects go into a single "user" message:
75
+ # messages = [
76
+ # {
77
+ # "role": "user",
78
+ # "content": [
79
+ # {"type": "image", "image": "file:///path/to/image1.jpg"},
80
+ # {"type": "image", "image": "file:///path/to/image2.jpg"},
81
+ # {"type": "text", "text": "Describe these images."}
82
+ # ]
83
+ # }
84
+ # ]
85
+ message_content = []
86
+ for media_object in request.multimodal_prompt.media_objects:
87
+ if media_object.is_type("image") and media_object.location:
88
+ message_content.append({"type": "image", "image": media_object.location})
89
+ elif media_object.is_type(TEXT_TYPE):
90
+ if media_object.text is None:
91
+ raise ValueError("MediaObject of text type has missing text field value")
92
+ message_content.append({"type": "text", "text": media_object.text})
93
+ else:
94
+ raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
95
+
96
+ messages = [{"role": "user", "content": message_content}]
97
+
98
+ # Prepare text and vision inputs
99
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
100
+ image_inputs, video_inputs = process_vision_info(messages)
101
+
102
+ inputs = processor(
103
+ text=[text],
104
+ images=image_inputs,
105
+ videos=video_inputs,
106
+ padding=True,
107
+ return_tensors="pt",
108
+ ).to(self._device)
109
+
110
+ generation_args = {
111
+ "max_new_tokens": request.max_tokens,
112
+ }
113
+
114
+ completions: List[GeneratedOutput] = []
115
+ request_time: float = 0
116
+ request_datetime: Optional[int] = None
117
+ all_cached: bool = True
118
+
119
+ with htrack_block(f"Generating for prompt: {text}"):
120
+ for completion_index in range(request.num_completions):
121
+ try:
122
+
123
+ def do_it() -> Dict[str, Any]:
124
+ generated_ids = model.generate(**inputs, **generation_args)
125
+ # Remove the input prefix from outputs
126
+ generated_ids_trimmed = [
127
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
128
+ ]
129
+ output_text = processor.batch_decode(
130
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
131
+ )
132
+ # There's only one batch element
133
+ completion = output_text[0]
134
+ # For simplicity, we split tokens by whitespace.
135
+ # A more accurate tokenization would require a tokenizer for Qwen2, if desired.
136
+ tokens = completion.split()
137
+ return {"output": (completion, tokens)}
138
+
139
+ cache_key = CachingClient.make_cache_key(
140
+ raw_request={
141
+ "completion_index": completion_index,
142
+ "model": request.model,
143
+ "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
144
+ **generation_args,
145
+ },
146
+ request=request,
147
+ )
148
+ result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
149
+ except RuntimeError as model_error:
150
+ return RequestResult(
151
+ success=False, cached=False, error=str(model_error), completions=[], embedding=[]
152
+ )
153
+
154
+ text_out, tokens = result["output"]
155
+ completions.append(
156
+ GeneratedOutput(
157
+ text=text_out,
158
+ logprob=0,
159
+ tokens=[Token(text=str(token), logprob=0) for token in tokens],
160
+ )
161
+ )
162
+ hlog(f"Generated: {text_out}")
163
+
164
+ request_time += result["request_time"]
165
+ request_datetime = request_datetime or result.get("request_datetime")
166
+ all_cached = all_cached and cached
167
+
168
+ return RequestResult(
169
+ success=True,
170
+ cached=all_cached,
171
+ request_time=request_time,
172
+ request_datetime=request_datetime,
173
+ completions=completions,
174
+ embedding=[],
175
+ )
@@ -2,13 +2,15 @@ from typing import Any, Dict, Optional
2
2
 
3
3
  from helm.common.cache import CacheConfig
4
4
  from helm.common.request import Request
5
- from helm.clients.openai_client import OpenAIClient
5
+ from helm.clients.openai_client import OpenAILegacyCompletionsClient
6
6
  from helm.tokenizers.tokenizer import Tokenizer
7
7
 
8
8
 
9
- class VLLMClient(OpenAIClient):
9
+ class VLLMClient(OpenAILegacyCompletionsClient):
10
10
  """Sends request to a vLLM server using the OpenAI-compatible API.
11
11
 
12
+ Only supports the legacy Text Completions API, rather than the Chat Completions API.
13
+
12
14
  See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""
13
15
 
14
16
  def __init__(
@@ -29,10 +31,6 @@ class VLLMClient(OpenAIClient):
29
31
  self.tokenizer = tokenizer
30
32
  self.tokenizer_name = tokenizer_name
31
33
 
32
- def _is_chat_model_engine(self, model_engine: str) -> bool:
33
- # Only support vLLM completion models for now.
34
- return False
35
-
36
34
  def _get_model_for_request(self, request: Request) -> str:
37
35
  # The `model` parameter for vLLM should be the whole model name including the creator organization,
38
36
  # unlike OpenAI which only uses the model engine.
helm/clients/yi_client.py CHANGED
@@ -26,6 +26,3 @@ class YiChatClient(OpenAIClient):
26
26
  org_id=None,
27
27
  base_url=YiChatClient.BASE_URL,
28
28
  )
29
-
30
- def _is_chat_model_engine(self, model_engine: str) -> bool:
31
- return True
@@ -0,0 +1,111 @@
1
+ from io import BytesIO
2
+ from typing import Optional
3
+ from filelock import FileLock
4
+ import base64
5
+ import os
6
+
7
+ import ffmpeg
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import subprocess
11
+
12
+ from helm.common.hierarchical_logger import hlog
13
+ from helm.common.multimodal_request_utils import get_contents_as_bytes
14
+ from helm.common.optional_dependencies import handle_module_not_found_error
15
+
16
+ try:
17
+ import librosa
18
+ except ModuleNotFoundError as e:
19
+ handle_module_not_found_error(e, ["audiolm"])
20
+
21
+
22
+ def ensure_audio_file_exists_from_array(path: str, array: np.ndarray, sample_rate: int) -> None:
23
+ """Write the array to the wav or mp3 file if it does not already exist.
24
+
25
+ Uses file locking and an atomic rename to avoid file corruption due to incomplete writes and
26
+ concurrent writes."""
27
+ file_extension = os.path.splitext(path)[1]
28
+ if file_extension != ".wav" and file_extension != ".mp3":
29
+ raise ValueError(f"Path must end with .wav or .mp3: {path}")
30
+ with FileLock(f"{path}.lock"):
31
+ if os.path.exists(path):
32
+ # Skip because file already exists
33
+ return
34
+ path_prefix = path.removesuffix(file_extension)
35
+ tmp_path = f"{path_prefix}.tmp{file_extension}"
36
+ sf.write(tmp_path, array, samplerate=sample_rate)
37
+ os.rename(tmp_path, path)
38
+
39
+
40
+ def get_array_from_audio_file(path: str, sample_rate: Optional[int]) -> np.ndarray:
41
+ """Get an array from an audio file"""
42
+ audio_file = (
43
+ BytesIO(get_contents_as_bytes(path)) if path.startswith("http://") or path.startswith("https://") else path
44
+ )
45
+
46
+ # librosa accepts a local file path or a file-like object
47
+ audio_array, _ = librosa.load(audio_file, sr=sample_rate)
48
+ return audio_array
49
+
50
+
51
+ def use_ffmpeg_to_convert_audio_file(input_path: str, output_path: str) -> None:
52
+ if os.path.exists(output_path):
53
+ return
54
+ """Use ffmpeg to convert an audio file type"""
55
+ try:
56
+ subprocess.run(["ffmpeg", "-i", input_path, output_path], check=True)
57
+ except (subprocess.CalledProcessError, FileNotFoundError):
58
+ raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to convert audio files.")
59
+
60
+
61
+ def is_invalid_audio_file(audio_path: str) -> bool:
62
+ """
63
+ Two conditions for an audio file to be considered invalid:
64
+ 1. The file does not exist.
65
+ 2. The file is empty.
66
+ """
67
+ if not os.path.exists(audio_path):
68
+ return True
69
+
70
+ try:
71
+ with sf.SoundFile(audio_path) as audio_file:
72
+ return len(audio_file) == 0
73
+ except RuntimeError:
74
+ return True
75
+
76
+
77
+ def extract_audio(video_path: str, output_audio_path: str) -> None:
78
+ """
79
+ Extracts audio from an MP4 video file and saves it as an MP3 file.
80
+
81
+ Args:
82
+ video_path (str): Path to the input MP4 video file.
83
+ output_audio_path (str): Path to save the extracted MP3 audio file.
84
+
85
+ Returns:
86
+ None
87
+ """
88
+ try:
89
+ (
90
+ ffmpeg.input(video_path)
91
+ .output(output_audio_path, format="mp3", acodec="libmp3lame", audio_bitrate="192k")
92
+ .run(overwrite_output=True)
93
+ )
94
+ except ffmpeg.Error as e:
95
+ hlog(f"Error extracting audio from video: {video_path}: {e.stderr.decode()}")
96
+ raise e
97
+
98
+
99
+ def encode_audio_to_base64(file_path: str) -> str:
100
+ """
101
+ Encodes an audio file to a Base64 string.
102
+
103
+ Args:
104
+ file_path (str): Path to the audio file.
105
+
106
+ Returns:
107
+ str: Base64-encoded string of the audio file.
108
+ """
109
+ assert os.path.exists(file_path), f"Audio file does not exist at path: {file_path}"
110
+ with open(file_path, "rb") as audio_file:
111
+ return base64.b64encode(audio_file.read()).decode("utf-8")
@@ -2,7 +2,7 @@ import os
2
2
  from typing import Callable
3
3
 
4
4
  from helm.common.general import ensure_directory_exists, generate_unique_id
5
- from .file_cache import FileCache
5
+ from helm.common.file_caches.file_cache import FileCache
6
6
 
7
7
  from helm.common.optional_dependencies import handle_module_not_found_error
8
8
 
@@ -3,7 +3,7 @@ import shutil
3
3
  import tempfile
4
4
  import unittest
5
5
 
6
- from .local_file_cache import LocalFileCache
6
+ from helm.common.file_caches.local_file_cache import LocalFileCache
7
7
 
8
8
 
9
9
  class TestLocalFileCache(unittest.TestCase):
@@ -10,7 +10,7 @@ from urllib.request import urlopen
10
10
 
11
11
  import numpy as np
12
12
 
13
- from .general import is_url
13
+ from helm.common.general import is_url
14
14
  from helm.common.optional_dependencies import handle_module_not_found_error
15
15
 
16
16
  try:
@@ -25,7 +25,7 @@ def open_image(image_location: str) -> Image.Image:
25
25
  """
26
26
  image: Image.Image
27
27
  if is_url(image_location):
28
- image = Image.open(requests.get(image_location, stream=True).raw)
28
+ image = Image.open(requests.get(image_location, stream=True).raw) # type: ignore
29
29
  else:
30
30
  image = Image.open(image_location)
31
31
  return image.convert("RGB")
@@ -2,7 +2,7 @@ import os
2
2
  import urllib
3
3
  from copy import deepcopy
4
4
  from dataclasses import dataclass, field, replace
5
- from typing import List, Optional
5
+ from typing import Any, Dict, List, Optional
6
6
 
7
7
 
8
8
  IMAGE_TYPE = "image"
@@ -27,7 +27,7 @@ class MediaObject:
27
27
  location: Optional[str] = None
28
28
  """When the media object is a file, specify the location of the media object, which can be a local path or URL."""
29
29
 
30
- def to_dict(self) -> dict:
30
+ def to_dict(self) -> Dict[str, Any]:
31
31
  """Converts the media object to a dictionary."""
32
32
  return {key: value for key, value in self.__dict__.items() if value is not None}
33
33
 
@@ -1,10 +1,36 @@
1
+ import base64
1
2
  from typing import List, Optional
2
3
 
4
+ import requests
5
+ import urllib.parse
6
+
3
7
  from helm.benchmark.adaptation.request_state import RequestState
4
8
  from helm.benchmark.scenarios.scenario import Reference
5
9
  from helm.common.request import RequestResult
6
10
 
7
11
 
12
+ def get_contents_as_bytes(path: str) -> bytes:
13
+ """Get the contents at the location as bytes.
14
+
15
+ The location can be a local path or a URL."""
16
+ # Fetch the audio file and convert it to a base64 encoded string
17
+ is_remote = urllib.parse.urlparse(path).scheme in ["http", "https"]
18
+ if is_remote:
19
+ response = requests.get(path)
20
+ response.raise_for_status()
21
+ return response.content
22
+ else:
23
+ with open(path, "rb") as f:
24
+ return f.read()
25
+
26
+
27
+ def get_contents_as_base64(path: str) -> str:
28
+ """Get the contents at the location as a base64-encoded string.
29
+
30
+ The location can be a local path or a URL."""
31
+ return base64.b64encode(get_contents_as_bytes(path)).decode("utf-8")
32
+
33
+
8
34
  def gather_generated_image_locations(request_result: RequestResult) -> List[str]:
9
35
  """Gathers the locations (file paths or URLs) of the generated images."""
10
36
  image_locations: List[str] = []
@@ -0,0 +1,12 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+
5
+ @dataclass(frozen=True)
6
+ class REEvalParameters:
7
+ """
8
+ Parameters for reeval evaluation.
9
+ """
10
+
11
+ model_ability: Optional[float] = None
12
+ """The inital ability of the model to perform the task. Used for reeval evaluation."""
helm/common/request.py CHANGED
@@ -4,7 +4,8 @@ from typing import Any, Callable, Dict, List, Optional
4
4
 
5
5
  from helm.common.media_object import MultimediaObject
6
6
  from helm.common.image_generation_parameters import ImageGenerationParameters
7
- from .general import indent_lines, format_text
7
+ from helm.common.general import indent_lines, format_text
8
+ from helm.common.response_format import ResponseFormat
8
9
 
9
10
 
10
11
  @dataclass(frozen=True)
@@ -72,6 +73,9 @@ class Request:
72
73
  image_generation_parameters: Optional[ImageGenerationParameters] = None
73
74
  """Parameters for image generation."""
74
75
 
76
+ response_format: Optional[ResponseFormat] = None
77
+ """EXPERIMENTAL: Response format. Currently only supported by OpenAI and Together."""
78
+
75
79
  def validate(self):
76
80
  if (
77
81
  (self.messages and self.prompt)
@@ -193,7 +197,7 @@ class RequestResult:
193
197
  """Whether the request was actually cached"""
194
198
 
195
199
  request_time: Optional[float] = None
196
- """How long did the request take?"""
200
+ """How long the request took in seconds"""
197
201
 
198
202
  request_datetime: Optional[int] = None
199
203
  """When was the request sent?
@@ -0,0 +1,18 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, Dict, Optional
3
+
4
+
5
+ @dataclass(frozen=True)
6
+ class ResponseFormat:
7
+ """EXPERIMENTAL: Model response format.
8
+
9
+ Currently only supports JSON schema.
10
+
11
+ Currently only supported by OpenAI and Together.
12
+
13
+ See:
14
+ - https://platform.openai.com/docs/guides/structured-outputs
15
+ - https://docs.together.ai/docs/json-mode"""
16
+
17
+ json_schema: Optional[Dict[str, Any]] = None
18
+ """EXPERIMENTAL: The JSON schema that the model output should conform to."""
@@ -1,6 +1,6 @@
1
1
  import unittest
2
2
 
3
- from .media_object import MediaObject, MultimediaObject
3
+ from helm.common.media_object import MediaObject, MultimediaObject
4
4
 
5
5
 
6
6
  class TestMediaObject(unittest.TestCase):