crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm has been flagged as potentially problematic.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
@@ -0,0 +1,104 @@
+ import copy
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+ from helm.clients.audio_language.llama_omni.constants import IGNORE_INDEX
+
+
+ def lengths_to_padding_mask(lens):
+     bsz, max_lens = lens.size(0), torch.max(lens).item()
+     mask = torch.arange(max_lens).to(lens.device).view([1, int(max_lens)])
+     mask = mask.expand(bsz, -1) >= lens.view(bsz, 1).expand(-1, max_lens)
+     return mask
+
+
+ def _uniform_assignment(src_lens, tgt_lens):
+     tgt_max_len = torch.max(tgt_lens).item()
+     tgt_indices = torch.arange(tgt_max_len).expand(len(tgt_lens), -1).to(tgt_lens.device)
+     ratio = tgt_lens / src_lens
+     index_t = (tgt_indices / ratio.view(-1, 1)).long()
+     return index_t
+
+
+ class SpeechGeneratorCTC(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         n_layers, n_dims, n_heads, n_inter_dims = list(map(int, config.ctc_decoder_config[1:-1].split(",")))
+         _config = copy.deepcopy(config)
+         _config.hidden_size = n_dims
+         _config.num_hidden_layers = n_layers
+         _config.num_attention_heads = n_heads
+         _config.num_key_value_heads = n_heads
+         _config.intermediate_size = n_inter_dims
+         _config._attn_implementation = "flash_attention_2"
+         self.upsample_factor = config.ctc_upsample_factor
+         self.input_proj = nn.Linear(config.hidden_size, n_dims)
+         self.layers = nn.ModuleList([LlamaDecoderLayer(_config, layer_idx) for layer_idx in range(n_layers)])
+         self.unit_vocab_size = config.unit_vocab_size
+         self.output_proj = nn.Linear(n_dims, config.unit_vocab_size + 1)
+
+     def upsample(self, reps, tgt_units=None):
+         src_lens = torch.LongTensor([len(rep) for rep in reps]).to(reps[0].device)
+         up_lens = src_lens * self.upsample_factor
+         if tgt_units is not None:
+             tgt_lens = tgt_units.ne(IGNORE_INDEX).long().sum(dim=-1)
+             up_lens = torch.max(up_lens, tgt_lens)
+         reps = torch.nn.utils.rnn.pad_sequence(reps, batch_first=True)
+         padding_mask = lengths_to_padding_mask(up_lens)
+         mapped_inputs = _uniform_assignment(src_lens, up_lens).masked_fill(padding_mask, 0)
+         copied_reps = torch.gather(
+             reps,
+             1,
+             mapped_inputs.unsqueeze(-1).expand(*mapped_inputs.size(), reps.size(-1)),
+         )
+         copied_reps = copied_reps.masked_fill(padding_mask.unsqueeze(-1), 0)
+         position_ids = torch.arange(0, max(up_lens)).unsqueeze(0).expand(len(reps), -1).to(device=copied_reps.device)
+         return copied_reps, ~padding_mask, position_ids
+
+     def forward(self, tgt_reps, labels, tgt_units):
+         tgt_label_reps = []
+         for tgt_rep, label in zip(tgt_reps, labels):
+             tgt_label_reps.append(tgt_rep[label != IGNORE_INDEX])
+         hidden_states, attention_mask, position_ids = self.upsample(tgt_label_reps, tgt_units)
+         hidden_states = self.input_proj(hidden_states)
+         for layer in self.layers:
+             layer_outputs = layer(
+                 hidden_states,
+                 attention_mask=attention_mask,
+                 position_ids=position_ids,
+             )
+             hidden_states = layer_outputs[0]
+         ctc_logits = self.output_proj(hidden_states)
+         ctc_lprobs = F.log_softmax(ctc_logits.float(), dim=-1, dtype=torch.float32)
+         ctc_lens = attention_mask.long().sum(dim=-1)
+         ctc_tgt_lens = tgt_units.ne(IGNORE_INDEX).long().sum(dim=-1)
+         ctc_tgt_mask = ~lengths_to_padding_mask(ctc_tgt_lens)
+         ctc_tgt_flat = tgt_units.masked_select(ctc_tgt_mask)
+         ctc_loss = F.ctc_loss(
+             ctc_lprobs.transpose(0, 1),
+             ctc_tgt_flat,
+             ctc_lens,
+             ctc_tgt_lens,
+             reduction="sum",
+             zero_infinity=True,
+             blank=self.unit_vocab_size,
+         )
+         ctc_loss /= ctc_tgt_lens.sum().item()
+         return ctc_loss
+
+     def predict(self, tgt_reps):
+         hidden_states, attention_mask, position_ids = self.upsample([tgt_reps])
+         hidden_states = self.input_proj(hidden_states)
+         for layer in self.layers:
+             layer_outputs = layer(
+                 hidden_states,
+                 attention_mask=attention_mask,
+                 position_ids=position_ids,
+             )
+             hidden_states = layer_outputs[0]
+         ctc_logits = self.output_proj(hidden_states)
+         ctc_lprobs = F.log_softmax(ctc_logits.float(), dim=-1, dtype=torch.float32)
+         ctc_pred = ctc_lprobs.argmax(dim=-1).masked_fill_(~attention_mask, self.unit_vocab_size)
+         return ctc_pred
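
(Aside: to make the CTC upsampling above easier to follow, here is a minimal sketch, not part of the diff; the import path is inferred from speech_generator/speech_generator.py in the file list above.)

import torch
from helm.clients.audio_language.llama_omni.model.speech_generator.speech_generator import (
    _uniform_assignment,
    lengths_to_padding_mask,
)

# True marks padding slots past each sequence's length.
print(lengths_to_padding_mask(torch.tensor([2, 4])))
# tensor([[False, False,  True,  True],
#         [False, False, False, False]])

# For each target slot, the source frame it copies from: 2 source frames
# spread uniformly over 4 target slots (this is what upsample() gathers with).
print(_uniform_assignment(torch.tensor([2]), torch.tensor([4])))
# tensor([[0, 0, 1, 1]])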
@@ -0,0 +1,9 @@
+ from helm.clients.audio_language.llama_omni.model.speech_projector.speech_projector import EncoderProjectorConcat
+
+
+ def build_speech_projector(config):
+     projector_type = getattr(config, "speech_projector_type", "linear")
+     if projector_type == "linear":
+         return EncoderProjectorConcat(config)
+
+     raise ValueError(f"Unknown projector type: {projector_type}")
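
(A hedged usage sketch of the factory above; the config stub is hypothetical and only carries the one field the dispatch reads. "linear" is the default when speech_projector_type is absent; anything else raises.)

from types import SimpleNamespace
from helm.clients.audio_language.llama_omni.model.speech_projector.builder import build_speech_projector

build_speech_projector(SimpleNamespace(speech_projector_type="mlp"))
# ValueError: Unknown projector type: mlp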
@@ -0,0 +1,27 @@
+ # Adopted from https://github.com/ddlBoJack/SLAM-LLM/blob/main/src/slam_llm/models/projector.py
+ import torch.nn as nn
+
+
+ class EncoderProjectorConcat(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.k = config.speech_encoder_ds_rate
+         self.encoder_dim = config.speech_encoder_hidden_size
+         self.llm_dim = config.hidden_size
+         self.linear1 = nn.Linear(self.encoder_dim * self.k, 2048)
+         self.relu = nn.ReLU()
+         self.linear2 = nn.Linear(2048, config.hidden_size)
+
+     def forward(self, x):
+         batch_size, seq_len, dim = x.size()
+         num_frames_to_discard = seq_len % self.k
+         if num_frames_to_discard > 0:
+             x = x[:, :-num_frames_to_discard, :]
+         seq_len = x.size(1)
+
+         x = x.contiguous()
+         x = x.view(batch_size, seq_len // self.k, dim * self.k)
+         x = self.linear1(x)
+         x = self.relu(x)
+         x = self.linear2(x)
+         return x
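
(Shape check, as an illustrative sketch rather than part of the diff: the projector drops trailing frames so the length divides by k, concatenates every k consecutive encoder frames, and projects to the LLM width. The config values below are hypothetical; only the field names come from the class above.)

import torch
from types import SimpleNamespace
from helm.clients.audio_language.llama_omni.model.speech_projector.speech_projector import EncoderProjectorConcat

config = SimpleNamespace(speech_encoder_ds_rate=5, speech_encoder_hidden_size=1280, hidden_size=4096)
projector = EncoderProjectorConcat(config)

x = torch.randn(2, 98, 1280)  # (batch, frames, encoder_dim)
print(projector(x).shape)     # torch.Size([2, 19, 4096]): 98 -> 95 frames kept, grouped 5 at a time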
@@ -0,0 +1,295 @@
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
+ # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import torch
+ import transformers
+
+ from typing import Dict, Sequence
+
+ from helm.clients.audio_language.llama_omni.constants import IGNORE_INDEX, SPEECH_TOKEN_INDEX
+ import helm.clients.audio_language.llama_omni.conversation as conversation_lib
+
+
+ def tokenizer_speech_token(prompt, tokenizer, speech_token_index=SPEECH_TOKEN_INDEX, return_tensors=None):
+     prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<speech>")]
+
+     def insert_separator(X, sep):
+         return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
+
+     input_ids = []
+     offset = 0
+     if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
+         offset = 1
+         input_ids.append(prompt_chunks[0][0])
+
+     for x in insert_separator(prompt_chunks, [speech_token_index] * (offset + 1)):
+         input_ids.extend(x[offset:])
+
+     if return_tensors is not None:
+         if return_tensors == "pt":
+             return torch.tensor(input_ids, dtype=torch.long)
+         raise ValueError(f"Unsupported tensor type: {return_tensors}")
+     return input_ids
+
+
+ def preprocess_llama_2(sources, tokenizer: transformers.PreTrainedTokenizer, has_speech: bool = False) -> Dict:
+     conv = conversation_lib.default_conversation.copy()
+     roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+
+     # Apply prompt templates
+     conversations = []
+     for i, source in enumerate(sources):
+         if roles[source[0]["from"]] != conv.roles[0]:
+             # Skip the first one if it is not from human
+             source = source[1:]
+
+         conv.messages = []
+         for j, sentence in enumerate(source):
+             role = roles[sentence["from"]]
+             assert role == conv.roles[j % 2], f"{i}"
+             conv.append_message(role, sentence["value"])
+         conversations.append(conv.get_prompt())
+
+     # Tokenize conversations
+
+     if has_speech:
+         input_ids = torch.stack(
+             [tokenizer_speech_token(prompt, tokenizer, return_tensors="pt") for prompt in conversations], dim=0
+         )
+     else:
+         input_ids = tokenizer(
+             conversations,
+             return_tensors="pt",
+             padding="longest",
+             max_length=tokenizer.model_max_length,
+             truncation=True,
+         ).input_ids
+
+     targets = input_ids.clone()
+
+     assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2
+
+     # Mask targets
+     sep = "[/INST] "
+     for conversation, target in zip(conversations, targets):
+         total_len = int(target.ne(tokenizer.pad_token_id).sum())
+
+         rounds = conversation.split(conv.sep2)
+         cur_len = 1
+         target[:cur_len] = IGNORE_INDEX
+         for i, rou in enumerate(rounds):
+             if rou == "":
+                 break
+
+             parts = rou.split(sep)
+             if len(parts) != 2:
+                 break
+             parts[0] += sep
+
+             if has_speech:
+                 round_len = len(tokenizer_speech_token(rou, tokenizer))
+                 instruction_len = len(tokenizer_speech_token(parts[0], tokenizer)) - 2
+             else:
+                 round_len = len(tokenizer(rou).input_ids)
+                 instruction_len = len(tokenizer(parts[0]).input_ids) - 2
+
+             target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
+
+             cur_len += round_len
+         target[cur_len:] = IGNORE_INDEX
+
+         if cur_len < tokenizer.model_max_length:
+             if cur_len != total_len:
+                 target[:] = IGNORE_INDEX
+                 print(f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." f" (ignored)")
+
+     return dict(
+         input_ids=input_ids,
+         labels=targets,
+     )
+
+
+ def preprocess_llama_3(sources, tokenizer: transformers.PreTrainedTokenizer, has_speech: bool = False) -> Dict:
+     conv = conversation_lib.default_conversation.copy()
+     roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+
+     # Apply prompt templates
+     conversations = []
+     for i, source in enumerate(sources):
+         if roles[source[0]["from"]] != conv.roles[0]:
+             # Skip the first one if it is not from human
+             source = source[1:]
+
+         assert len(source) == 2, "now only support single-turn conversation"
+
+         conv.messages = []
+         for j, sentence in enumerate(source):
+             role = roles[sentence["from"]]
+             assert role == conv.roles[j % 2], f"{i}"
+             conv.append_message(role, sentence["value"])
+         conversations.append(conv.get_prompt())
+
+     # Tokenize conversations
+
+     if has_speech:
+         input_ids = torch.stack(
+             [tokenizer_speech_token(prompt, tokenizer, return_tensors="pt") for prompt in conversations], dim=0
+         )
+     else:
+         input_ids = tokenizer(
+             conversations,
+             return_tensors="pt",
+             padding="longest",
+             max_length=tokenizer.model_max_length,
+             truncation=True,
+         ).input_ids
+
+     targets = input_ids.clone()
+
+     assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_3
+
+     # Mask targets
+     sep = "<|start_header_id|>" + conv.roles[1] + "<|end_header_id|>\n\n"
+     for conversation, target in zip(conversations, targets):
+
+         cur_len = 1
+         target[:cur_len] = IGNORE_INDEX
+         parts = conversation.split(sep)
+         parts[0] += sep
+
+         if has_speech:
+             conversation_len = len(tokenizer_speech_token(conversation, tokenizer))
+             instruction_len = len(tokenizer_speech_token(parts[0], tokenizer)) - 1
+         else:
+             conversation_len = len(tokenizer(conversation).input_ids)
+             instruction_len = len(tokenizer(parts[0]).input_ids) - 1
+
+         target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
+         cur_len += conversation_len
+         target[cur_len:] = IGNORE_INDEX
+
+         # if cur_len < tokenizer.model_max_length:
+         #     if cur_len != total_len:
+         #         target[:] = IGNORE_INDEX
+         #         print(
+         #             f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
+         #             f" (ignored)"
+         #         )
+
+     return dict(
+         input_ids=input_ids,
+         labels=targets,
+     )
+
+
+ def preprocess_v1(sources, tokenizer: transformers.PreTrainedTokenizer, has_speech: bool = False) -> Dict:
+     conv = conversation_lib.default_conversation.copy()
+     roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+
+     # Apply prompt templates
+     conversations = []
+     for i, source in enumerate(sources):
+         if roles[source[0]["from"]] != conv.roles[0]:
+             # Skip the first one if it is not from human
+             source = source[1:]
+
+         conv.messages = []
+         for j, sentence in enumerate(source):
+             role = roles[sentence["from"]]
+             assert role == conv.roles[j % 2], f"{i}"
+             conv.append_message(role, sentence["value"])
+         conversations.append(conv.get_prompt())
+
+     # Tokenize conversations
+
+     if has_speech:
+         input_ids = torch.stack(
+             [tokenizer_speech_token(prompt, tokenizer, return_tensors="pt") for prompt in conversations], dim=0
+         )
+     else:
+         input_ids = tokenizer(
+             conversations,
+             return_tensors="pt",
+             padding="longest",
+             max_length=tokenizer.model_max_length,
+             truncation=True,
+         ).input_ids
+
+     targets = input_ids.clone()
+
+     assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
+
+     # Mask targets
+     sep = conv.sep + conv.roles[1] + ": "
+     for conversation, target in zip(conversations, targets):
+         total_len = int(target.ne(tokenizer.pad_token_id).sum())
+
+         rounds = conversation.split(conv.sep2)
+         cur_len = 1
+         target[:cur_len] = IGNORE_INDEX
+         for i, rou in enumerate(rounds):
+             if rou == "":
+                 break
+
+             parts = rou.split(sep)
+             if len(parts) != 2:
+                 break
+             parts[0] += sep
+
+             if has_speech:
+                 round_len = len(tokenizer_speech_token(rou, tokenizer))
+                 instruction_len = len(tokenizer_speech_token(parts[0], tokenizer)) - 2
+             else:
+                 round_len = len(tokenizer(rou).input_ids)
+                 instruction_len = len(tokenizer(parts[0]).input_ids) - 2
+
+             # FIXME: tokenizer bug
+             if i != 0 and not tokenizer.legacy:
+                 round_len -= 1
+                 instruction_len -= 1
+
+             target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
+
+             cur_len += round_len
+         target[cur_len:] = IGNORE_INDEX
+
+         if cur_len < tokenizer.model_max_length:
+             if cur_len != total_len:
+                 target[:] = IGNORE_INDEX
+                 print(f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." f" (ignored)")
+
+     return dict(
+         input_ids=input_ids,
+         labels=targets,
+     )
+
+
+ def preprocess(sources: Sequence[str], tokenizer: transformers.PreTrainedTokenizer, has_speech: bool = False) -> Dict:
+     """
+     Given a list of sources, each is a conversation list. This transform:
+     1. Add signal '### ' at the beginning each sentence, with end signal '\n';
+     2. Concatenate conversations together;
+     3. Tokenize the concatenated conversation;
+     4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
+     """
+     if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2:
+         return preprocess_llama_2(sources, tokenizer, has_speech=has_speech)
+     if conversation_lib.default_conversation.version.startswith("v1"):
+         return preprocess_v1(sources, tokenizer, has_speech=has_speech)
+     if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_3:
+         return preprocess_llama_3(sources, tokenizer, has_speech=has_speech)
+     raise NotImplementedError
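
(To illustrate tokenizer_speech_token above: a toy run with a hypothetical stub tokenizer that emits a BOS id plus one id per character; -200 stands in for SPEECH_TOKEN_INDEX.)

from helm.clients.audio_language.llama_omni.preprocess import tokenizer_speech_token

class ToyTokenizer:
    """Hypothetical stand-in: BOS id 1, then one id per character."""
    bos_token_id = 1

    def __call__(self, text):
        class Encoding:
            pass
        enc = Encoding()
        enc.input_ids = [self.bos_token_id] + [ord(c) for c in text]
        return enc

# The "<speech>" placeholder becomes a single speech token id between the
# chunks, and the duplicated BOS from the second chunk is dropped.
print(tokenizer_speech_token("hi<speech>ok", ToyTokenizer(), speech_token_index=-200))
# [1, 104, 105, -200, 111, 107]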
@@ -0,0 +1,202 @@
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
+ # Copyright 2023 Haotian Liu
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ import sys
+ import torch
+ import logging
+ import logging.handlers
+ import transformers
+
+ server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+ moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
+
+ handler = None
+
+
+ class StreamToLogger(object):
+     """
+     Fake file-like stream object that redirects writes to a logger instance.
+     """
+
+     def __init__(self, logger, log_level=logging.INFO):
+         self.terminal = sys.stdout
+         self.logger = logger
+         self.log_level = log_level
+         self.linebuf = ""
+
+     def __getattr__(self, attr):
+         return getattr(self.terminal, attr)
+
+     def write(self, buf):
+         temp_linebuf = self.linebuf + buf
+         self.linebuf = ""
+         for line in temp_linebuf.splitlines(True):
+             # From the io.TextIOWrapper docs:
+             #   On output, if newline is None, any '\n' characters written
+             #   are translated to the system default line separator.
+             # By default sys.stdout.write() expects '\n' newlines and then
+             # translates them so this is still cross platform.
+             if line[-1] == "\n":
+                 self.logger.log(self.log_level, line.rstrip())
+             else:
+                 self.linebuf += line
+
+     def flush(self):
+         if self.linebuf != "":
+             self.logger.log(self.log_level, self.linebuf.rstrip())
+         self.linebuf = ""
+
+
+ def maybe_zero_3(param, ignore_status=False, name=None):
+     from deepspeed import zero
+     from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
+
+     if hasattr(param, "ds_id"):
+         if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
+             if not ignore_status:
+                 logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
+         with zero.GatheredParameters([param]):
+             param = param.data.detach().cpu().clone()
+     else:
+         param = param.detach().cpu().clone()
+     return param
+
+
+ # Borrowed from peft.utils.get_peft_model_state_dict
+ def get_peft_state_maybe_zero_3(named_params, bias):
+     if bias == "none":
+         to_return = {k: t for k, t in named_params if "lora_" in k}
+     elif bias == "all":
+         to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
+     elif bias == "lora_only":
+         to_return = {}
+         maybe_lora_bias = {}
+         lora_bias_names = set()
+         for k, t in named_params:
+             if "lora_" in k:
+                 to_return[k] = t
+                 bias_name = k.split("lora_")[0] + "bias"
+                 lora_bias_names.add(bias_name)
+             elif "bias" in k:
+                 maybe_lora_bias[k] = t
+         for k, t in maybe_lora_bias:
+             if bias_name in lora_bias_names:
+                 to_return[bias_name] = t
+     else:
+         raise NotImplementedError
+     to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
+     return to_return
+
+
+ def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
+     to_return = {k: t for k, t in named_params if "lora_" not in k}
+     if require_grad_only:
+         to_return = {k: t for k, t in to_return.items() if t.requires_grad}
+     to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
+     return to_return
+
+
+ def get_speech_projector_state_maybe_zero_3(named_params, keys_to_match):
+     to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
+     to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
+     return to_return
+
+
+ def find_all_linear_names(model):
+     cls = torch.nn.Linear
+     lora_module_names = set()
+     speech_keywords = ["speech_projector", "speech_encoder"]
+     for name, module in model.named_modules():
+         if any(speech_keyword in name for speech_keyword in speech_keywords):
+             continue
+         if isinstance(module, cls):
+             names = name.split(".")
+             lora_module_names.add(names[0] if len(names) == 1 else names[-1])
+
+     if "lm_head" in lora_module_names:  # needed for 16-bit
+         lora_module_names.remove("lm_head")
+     return list(lora_module_names)
+
+
+ def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
+     """Collects the state dict and dump to disk."""
+
+     if getattr(trainer.args, "tune_speech_projector", False):
+         # Only save projector
+         keys_to_match = ["speech_projector"]
+         if getattr(trainer.args, "use_im_start_end", False):
+             keys_to_match.extend(["embed_tokens", "embed_in"])
+
+         weight_to_save = get_speech_projector_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match)
+         trainer.model.config.save_pretrained(output_dir)
+
+         current_folder = output_dir.split("/")[-1]
+         parent_folder = os.path.dirname(output_dir)
+         if trainer.args.local_rank == 0 or trainer.args.local_rank == -1:
+             if current_folder.startswith("checkpoint-"):
+                 speech_projector_folder = os.path.join(parent_folder, "speech_projector")
+                 os.makedirs(speech_projector_folder, exist_ok=True)
+                 torch.save(weight_to_save, os.path.join(speech_projector_folder, f"{current_folder}.bin"))
+             else:
+                 torch.save(weight_to_save, os.path.join(output_dir, "speech_projector.bin"))
+         return
+
+     if trainer.deepspeed:
+         torch.cuda.synchronize()
+         trainer.save_model(output_dir)
+         return
+
+     state_dict = trainer.model.state_dict()
+     if trainer.args.should_save:
+         cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
+         del state_dict
+         trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
+
+
+ def lengths_to_padding_mask(lens):
+     bsz, max_lens = lens.size(0), torch.max(lens).item()
+     mask = torch.arange(max_lens).to(lens.device).view([1, int(max_lens)])
+     mask = mask.expand(bsz, -1) >= lens.view(bsz, 1).expand(-1, max_lens)
+     return mask
+
+
+ def lengths_to_mask(lens):
+     return ~lengths_to_padding_mask(lens)
+
+
+ def disable_torch_init():
+     """
+     Disable the redundant torch default initialization to accelerate model creation.
+     """
+     import torch
+
+     setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
+     setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
+
+
+ def get_model_name_from_path(model_path):
+     model_path = model_path.strip("/")
+     model_paths = model_path.split("/")
+     if model_paths[-1].startswith("checkpoint-"):
+         return model_paths[-2] + "_" + model_paths[-1]
+     else:
+         return model_paths[-1]
+
+
+ def pretty_print_semaphore(semaphore):
+     if semaphore is None:
+         return "None"
+     return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
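
(Two small illustrative calls for the path helper above; the paths are made up.)

from helm.clients.audio_language.llama_omni.utils import get_model_name_from_path

print(get_model_name_from_path("/ckpts/Llama-3.1-8B-Omni"))                 # Llama-3.1-8B-Omni
print(get_model_name_from_path("/ckpts/Llama-3.1-8B-Omni/checkpoint-500"))  # Llama-3.1-8B-Omni_checkpoint-500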
@@ -12,7 +12,7 @@ from helm.clients.audio_language.llama_omni.preprocess import tokenizer_speech_t
  
  from helm.common.cache import CacheConfig
  from helm.common.gpu_utils import get_torch_device_name
- from helm.common.hierarchical_logger import hlog, htrack_block
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
  from helm.common.media_object import TEXT_TYPE
  from helm.common.request import Request, RequestResult, GeneratedOutput, Token
  from helm.common.request import wrap_request_time
@@ -170,6 +170,7 @@ class LlamaOmniAudioLMClient(CachingClient):
              )
              result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
          except RuntimeError as model_error:
+             hexception(model_error)
              return RequestResult(
                  success=False, cached=False, error=str(model_error), completions=[], embedding=[]
              )