crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

This is a potentially problematic release.


This version of crfm-helm might be problematic. See the advisory details below for more information.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,150 @@
1
+ from threading import Lock
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from dataclasses import dataclass
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
+
7
+ from helm.common.cache import CacheConfig
8
+ from helm.common.gpu_utils import get_torch_device_name
9
+ from helm.common.hierarchical_logger import hlog, htrack_block
10
+ from helm.common.media_object import TEXT_TYPE
11
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
12
+ from helm.common.request import wrap_request_time
13
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
14
+
15
+
16
@dataclass(frozen=True)
class LoadedQwenModelProcessor:
    """Loaded model and processor for Qwen.

    Bundles the pair returned by `AutoModelForCausalLM.from_pretrained` and
    `AutoTokenizer.from_pretrained` so they can be cached together in the
    module-level `_models` registry.
    """

    # NOTE(review): `AutoModelForCausalLM` / `AutoTokenizer` are factory classes,
    # not instance types; the stored objects are the concrete model/tokenizer
    # instances those factories return.
    model: AutoModelForCausalLM
    tokenizer: AutoTokenizer
22
+
23
+
24
# Guards lazy, one-time loading of each model into the in-memory registry below.
_models_lock: Lock = Lock()
# Registry of supported model names -> loaded model/tokenizer pair.
# A value of None means the model has not been loaded yet.
_models: Dict[str, Optional[LoadedQwenModelProcessor]] = {
    "Qwen/Qwen-Audio-Chat": None,
}
28
+
29
+
30
class QwenAudioLMClient(CachingClient):
    """
    From https://huggingface.co/Qwen/Qwen-Audio,
    Qwen-Audio-Chat (Qwen Large Audio Language Model) is the audio multimodal version of the large model series,
    Qwen (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen-Audio-Chat accepts audio, text as inputs, outputs text.
    Alibaba released Qwen-Audio and Qwen-Audio-Chat, which is a chatbot model based on Qwen-Audio.
    For now, we have integrated Qwen-Audio-Chat for instruction-following tasks.

    Paper: https://arxiv.org/abs/2311.07919
    """

    def __init__(self, cache_config: CacheConfig):
        super().__init__(cache_config=cache_config)
        # Device name (e.g. "cuda:0" or "cpu") the model is placed on when loaded.
        self._device: str = get_torch_device_name()

    def _get_model(self, helm_model_name: str) -> LoadedQwenModelProcessor:
        """Return the loaded model/tokenizer pair for `helm_model_name`, loading it on first use.

        Loading is guarded by `_models_lock` so concurrent requests load each model only once.

        Raises:
            ValueError: if `helm_model_name` is not a supported Qwen audio model.
        """
        global _models_lock
        global _models

        model_name: str
        if helm_model_name == "qwen-audio-chat":
            model_name = "Qwen/Qwen-Audio-Chat"
        else:
            raise ValueError(f"Unhandled model name: {helm_model_name}")

        # Ensure that only one thread is loading the model at a time
        with _models_lock:
            loaded_model_processor = _models[model_name]
            if loaded_model_processor is None:
                hlog(f"Loading model {model_name} and caching in memory...")
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    device_map=self._device,
                    trust_remote_code=True,
                    bf16=True,
                ).eval()
                tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
                _models[model_name] = LoadedQwenModelProcessor(model, tokenizer)
                loaded_model_processor = _models[model_name]

        assert loaded_model_processor is not None
        return loaded_model_processor

    def make_request(self, request: Request) -> RequestResult:
        """Generate completions for a multimodal (audio + text) prompt.

        Results are cached per (prompt, model, completion index, generation arguments).

        Raises:
            ValueError: on malformed media objects in the multimodal prompt.
        """
        assert request.multimodal_prompt is not None, "Multimodal prompt is required"

        loaded_model_processor: LoadedQwenModelProcessor = self._get_model(request.model_engine)
        model = loaded_model_processor.model
        tokenizer = loaded_model_processor.tokenizer

        # NOTE(review): Hugging Face `generate` treats `max_length` as prompt + new tokens;
        # `max_new_tokens` may match `request.max_tokens` semantics more closely — confirm
        # before changing (the key name is part of the cache key, so changing it
        # invalidates existing cache entries).
        generation_args = {
            "max_length": request.max_tokens,
        }

        # Build the Qwen-format query (a list of {"audio": ...} / {"text": ...} dicts),
        # accumulating the plain text for logging.
        query: List[Dict[str, str]] = []
        prompt_text: str = ""

        for media_object in request.multimodal_prompt.media_objects:
            if media_object.is_type("audio") and media_object.location:
                query.append({"audio": media_object.location})
            elif media_object.is_type(TEXT_TYPE):
                if media_object.text is None:
                    raise ValueError("MediaObject of text type has missing text field value")

                query.append({"text": media_object.text})
                prompt_text += media_object.text
            else:
                raise ValueError(f"Unrecognized MediaObject type {media_object.type}")

        completions: List[GeneratedOutput] = []
        request_time: float = 0
        request_datetime: Optional[int] = None
        all_cached: bool = True

        with htrack_block(f"Generating for prompt: {prompt_text}"):
            for completion_index in range(request.num_completions):
                try:

                    def do_it() -> Dict[str, Any]:
                        # Forward `generation_args` so that `request.max_tokens` is actually
                        # honored at generation time (Qwen's `chat` passes extra kwargs on to
                        # `generate`). Previously these arguments only appeared in the cache
                        # key and were silently ignored.
                        completion, _ = model.chat(
                            tokenizer, query=tokenizer.from_list_format(query), history=None, **generation_args
                        )
                        tokens: List[str] = tokenizer.tokenize(completion)
                        return {"output": (completion, tokens)}

                    # Include the prompt and model name in the cache key
                    cache_key = CachingClient.make_cache_key(
                        raw_request={
                            "completion_index": completion_index,
                            "model": request.model,
                            "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
                            **generation_args,
                        },
                        request=request,
                    )
                    result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
                except RuntimeError as model_error:
                    return RequestResult(
                        success=False, cached=False, error=str(model_error), completions=[], embedding=[]
                    )

                text, tokens = result["output"]

                # Tokenize truncated text to get the list of tokens
                completions.append(
                    GeneratedOutput(
                        text=text, logprob=0, tokens=[Token(text=str(token), logprob=0) for token in tokens]
                    )
                )

                request_time += result["request_time"]
                # Use the datetime from the first completion because that's when the request was fired
                request_datetime = request_datetime or result.get("request_datetime")
                all_cached = all_cached and cached

        return RequestResult(
            success=True,
            cached=all_cached,
            request_time=request_time,
            request_datetime=request_datetime,
            completions=completions,
            embedding=[],
        )
@@ -71,6 +71,7 @@ class AutoClient(Client):
71
71
  model_deployment.client_spec,
72
72
  constant_bindings={
73
73
  "cache_config": cache_config,
74
+ "model_name": model_deployment.model_name,
74
75
  "tokenizer_name": model_deployment.tokenizer_name,
75
76
  },
76
77
  provider_bindings={
@@ -81,6 +82,7 @@ class AutoClient(Client):
81
82
  "org_id": lambda: self.credentials.get(
82
83
  host_organization + "OrgId", None
83
84
  ), # OpenAI, GooseAI, Microsoft
85
+ "base_url": lambda: self.credentials.get(host_organization + "BaseUrl", None),
84
86
  "moderation_api_client": lambda: self.get_moderation_api_client(), # OpenAI DALL-E
85
87
  "lock_file_path": lambda: os.path.join(
86
88
  self.file_storage_path, f"{host_organization}.lock"
@@ -131,7 +133,7 @@ class AutoClient(Client):
131
133
  return replace(last_attempt.value, error=f"{retry_error}. Error: {last_attempt.value.error}")
132
134
 
133
135
  def get_gcs_client(self):
134
- from .gcs_client import GCSClient
136
+ from helm.clients.gcs_client import GCSClient
135
137
 
136
138
  bucket_name: str = self.credentials["gcsBucketName"]
137
139
  cache_config: CacheConfig = self.cache_backend_config.get_cache_config("gcs")
@@ -144,7 +146,7 @@ class AutoClient(Client):
144
146
  return NudityCheckClient(cache_config)
145
147
 
146
148
    def get_clip_score_client(self):
        """Construct a CLIPScoreClient backed by the "clip_score" cache.

        The import is function-scoped — presumably to keep optional CLIP
        dependencies out of module import time; verify before hoisting it.
        """
        from helm.clients.clip_score_client import CLIPScoreClient

        cache_config: CacheConfig = self.cache_backend_config.get_cache_config("clip_score")
        return CLIPScoreClient(cache_config)
@@ -0,0 +1,55 @@
1
+ import os
2
+ from typing import Dict, Optional
3
+
4
+ from helm.clients.openai_client import OpenAIClient
5
+ from helm.common.cache import CacheConfig
6
+ from helm.common.optional_dependencies import handle_module_not_found_error
7
+ from helm.proxy.retry import NonRetriableException
8
+ from helm.tokenizers.tokenizer import Tokenizer
9
+
10
+ try:
11
+ from openai import AzureOpenAI
12
+ except ModuleNotFoundError as e:
13
+ handle_module_not_found_error(e, ["openai"])
14
+
15
+
16
class AzureOpenAIClient(OpenAIClient):
    """OpenAI client that talks to an Azure OpenAI deployment.

    Exactly one of `endpoint` (or the AZURE_OPENAI_ENDPOINT environment variable)
    and `base_url` must be provided; the matching `AzureOpenAI` client is built.
    """

    # Default Azure OpenAI REST API version, used when `api_version` is not given.
    API_VERSION = "2024-07-01-preview"

    def __init__(
        self,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        cache_config: CacheConfig,
        api_key: Optional[str] = None,
        endpoint: Optional[str] = None,
        api_version: Optional[str] = None,
        default_headers: Optional[Dict[str, str]] = None,
        base_url: Optional[str] = None,
        azure_openai_deployment_name: Optional[str] = None,
    ):
        super().__init__(
            tokenizer=tokenizer,
            tokenizer_name=tokenizer_name,
            cache_config=cache_config,
            # The parent's api_key is unused: `self.client` is replaced below with
            # an AzureOpenAI client constructed from the real credentials.
            api_key="unused",
            openai_model_name=azure_openai_deployment_name,
        )
        azure_endpoint = endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
        # Validate with an explicit exception instead of `assert`: asserts are
        # stripped under `python -O`, and the previous assert also made the final
        # `else` branch unreachable. Truthiness checks (rather than `is None`)
        # additionally reject empty strings.
        if base_url and not azure_endpoint:
            self.client = AzureOpenAI(
                api_key=api_key,
                api_version=api_version or AzureOpenAIClient.API_VERSION,
                base_url=base_url,
                default_headers=default_headers,
            )
        elif azure_endpoint and not base_url:
            self.client = AzureOpenAI(
                api_key=api_key,
                api_version=api_version or AzureOpenAIClient.API_VERSION,
                azure_endpoint=azure_endpoint,
                default_headers=default_headers,
            )
        else:
            raise NonRetriableException("Exactly one of endpoint or base_url must be provided")
@@ -2,12 +2,13 @@ from abc import abstractmethod
2
2
  from copy import deepcopy
3
3
  import json
4
4
  import os
5
- from typing import Any, Dict, List, Mapping, Optional
5
+ from typing import Any, Dict, List, Mapping, Optional, TypedDict
6
+ from datetime import datetime
6
7
 
7
8
  from helm.common.cache import CacheConfig
8
9
  from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
9
10
  from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
10
- from helm.clients.bedrock_utils import get_bedrock_client
11
+ from helm.clients.bedrock_utils import get_bedrock_client, get_bedrock_client_v1
11
12
  from helm.tokenizers.tokenizer import Tokenizer
12
13
 
13
14
 
@@ -23,27 +24,41 @@ class BedrockClient(CachingClient):
23
24
  def convert_raw_response_to_completions(self, response: Dict, request: Request) -> List[GeneratedOutput]:
24
25
  raise NotImplementedError()
25
26
 
27
+ """
28
+ Amazon Bedrock is a fully managed service that provides s selection of leading foundation models (FMs) from Amazon
29
+ and other partner model providers.
30
+ """
31
+
32
+ @property
33
+ @abstractmethod
34
+ def model_provider(self) -> str:
35
+ raise NotImplementedError()
36
+
26
37
  def __init__(
27
38
  self,
28
39
  cache_config: CacheConfig,
29
40
  tokenizer: Tokenizer,
30
41
  tokenizer_name: str,
31
- bedrock_model_id: Optional[str] = None,
32
42
  assumed_role: Optional[str] = None,
33
43
  region: Optional[str] = None,
34
44
  ):
35
45
  super().__init__(cache_config=cache_config)
36
46
  self.tokenizer = tokenizer
37
47
  self.tokenizer_name = tokenizer_name
38
- self.bedrock_model_id = bedrock_model_id
39
48
  self.bedrock_client = get_bedrock_client(
40
49
  assumed_role=assumed_role or os.environ.get("BEDROCK_ASSUME_ROLE", None),
41
- region=region or os.environ.get("AWS_DEFAULT_REGION", None),
50
+ region=region,
42
51
  )
43
52
 
44
53
  def make_request(self, request: Request) -> RequestResult:
45
- # model_id should be something like "amazon.titan-tg1-large"
46
- model_id = self.bedrock_model_id if self.bedrock_model_id else request.model.replace("/", ".")
54
+ # model_id should be something like "amazon.titan-tg1-large", replace amazon- prefix with model creator name
55
+ model_name = request.model.split("/")[-1]
56
+ # check if model_name starts with "amazon-"
57
+ if self.model_provider == "amazon":
58
+ model_id = f"{self.model_provider}.{model_name}"
59
+ else:
60
+ model_id = model_name.replace("amazon-", f"{self.model_provider}.")
61
+
47
62
  raw_request = self.convert_request_to_raw_request(request)
48
63
 
49
64
  # modelId isn't part of raw_request, so it must be explicitly passed into the input to
@@ -58,6 +73,7 @@ class BedrockClient(CachingClient):
58
73
 
59
74
  try:
60
75
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
76
+
61
77
  except Exception as error:
62
78
  return RequestResult(
63
79
  success=False,
@@ -79,12 +95,109 @@ class BedrockClient(CachingClient):
79
95
  )
80
96
 
81
97
 
98
class _ContentBlock(TypedDict):
    # A single text segment within a Bedrock Converse API message.
    text: str


class _Message(TypedDict):
    # One turn in a Bedrock Converse API conversation.
    # `role` is taken from the request's message dicts, or "user" for plain prompts.
    role: str
    content: List[_ContentBlock]
105
+
106
+
107
class BedrockNovaClient(CachingClient):
    """Client for Amazon Nova models on Amazon Bedrock, using the Converse API.

    Amazon Bedrock is a fully managed service that provides a selection of leading foundation models (FMs)
    from Amazon and other partner model providers.
    """

    def __init__(
        self,
        cache_config: CacheConfig,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        assumed_role: Optional[str] = None,
        region: Optional[str] = None,
    ):
        super().__init__(cache_config=cache_config)
        self.tokenizer = tokenizer
        self.tokenizer_name = tokenizer_name
        # Falls back to the BEDROCK_ASSUME_ROLE environment variable when no role is given.
        self.bedrock_client = get_bedrock_client_v1(
            assumed_role=assumed_role or os.environ.get("BEDROCK_ASSUME_ROLE", None),
            region=region,
        )

    def _get_messages_from_request(self, request: Request) -> List[_Message]:
        """Convert the request's prompt or messages into Converse API message dicts.

        Raises:
            ValueError: if both `prompt` and `messages` are set, or a multimodal prompt is given.
        """
        if request.prompt and request.messages:
            raise ValueError(f"Only one of `prompt` and `messages` may be set in request: {request}")
        if request.multimodal_prompt:
            raise ValueError(f"`multimodal_prompt` is not supported in request: {request}")

        if request.messages:
            return [
                {"role": message["role"], "content": [{"text": message["content"]}]} for message in request.messages
            ]
        else:
            # A plain text prompt becomes a single user message.
            return [{"role": "user", "content": [{"text": request.prompt}]}]

    def convert_request_to_raw_request(self, request: Request) -> Dict:
        """Build the keyword arguments for `bedrock_client.converse` from a HELM Request."""
        # HELM model names separate creator and model with "/"; Bedrock model IDs use ".".
        model_id = request.model.replace("/", ".")
        messages = self._get_messages_from_request(request)

        return {
            "modelId": model_id,
            "inferenceConfig": {
                "temperature": request.temperature,
                "maxTokens": request.max_tokens,
                "topP": request.top_p,
            },
            "messages": messages,
        }

    def make_request(self, request: Request) -> RequestResult:
        """Send a Converse API request (with caching) and convert the response to a RequestResult."""
        raw_request = self.convert_request_to_raw_request(request)
        cache_key = CachingClient.make_cache_key(raw_request, request)

        def do_it() -> Dict[Any, Any]:
            return self.bedrock_client.converse(**raw_request)

        response, cached = self.cache.get(cache_key, do_it)

        completions = self.convert_raw_response_to_completions(response, request)
        # Parse the response's HTTP Date header (RFC 7231 fixed-GMT format) for the request timestamp.
        dt = datetime.strptime(response["ResponseMetadata"]["HTTPHeaders"]["date"], "%a, %d %b %Y %H:%M:%S GMT")
        # Use API reported latency rather than client measured latency
        request_time = response["metrics"]["latencyMs"] / 1000

        return RequestResult(
            success=True,
            cached=cached,
            request_time=request_time,
            request_datetime=int(dt.timestamp()),
            completions=completions,
            embedding=[],
        )

    def convert_raw_response_to_completions(self, response: Dict, request: Request) -> List[GeneratedOutput]:
        """Extract the single generated message from a Converse API response."""
        completions: List[GeneratedOutput] = []
        raw_completion = response["output"]
        output_text = raw_completion["message"]["content"][0]["text"]
        finish_reason = response["stopReason"]
        # Strip leading whitespace from the output (cf. the issue #2467 workaround
        # in the other Bedrock clients).
        completion = truncate_and_tokenize_response_text(
            output_text.lstrip(), request, self.tokenizer, self.tokenizer_name, finish_reason
        )
        completions.append(completion)
        return completions
189
+
190
+
191
+ # Amazon Bedrock Client for Titan Models
82
192
  class BedrockTitanClient(BedrockClient):
83
193
  _COMPLETION_REASON_TO_FINISH_REASON = {
84
194
  "LENGTH": "length",
85
195
  "FINISH": "endoftext",
86
196
  }
87
197
 
198
+ # creator org for titan
199
+ model_provider = "amazon"
200
+
88
201
  def convert_request_to_raw_request(self, request: Request) -> Dict:
89
202
  # TODO: Support the following:
90
203
  # - top_k_per_token
@@ -115,6 +228,7 @@ class BedrockTitanClient(BedrockClient):
115
228
  # - tokens
116
229
  # - logprob
117
230
  completions: List[GeneratedOutput] = []
231
+
118
232
  for raw_completion in response["results"]:
119
233
  output_text = raw_completion["outputText"]
120
234
  # Call lstrip() Titan has the tendency to emit "\n" as the first token in the generated text output.
@@ -126,3 +240,83 @@ class BedrockTitanClient(BedrockClient):
126
240
  )
127
241
  completions.append(completion)
128
242
  return completions
243
+
244
+
245
# Amazon Bedrock Client for Mistral Models
class BedrockMistralClient(BedrockClient):
    """Bedrock client for Mistral models, using Mistral's request/response JSON format."""

    _COMPLETION_REASON_TO_FINISH_REASON = {
        "length": "length",
        "stop": "endoftext",
    }

    model_provider = "mistral"

    def convert_request_to_raw_request(self, request: Request) -> Dict:
        """Build the Mistral-on-Bedrock request body from a HELM Request."""
        # TODO: Support the following:
        # - top_k_per_token
        # - echo_prompt
        # - num_completions
        instruction_prompt = f"[INST]{request.prompt}[/INST]"
        raw_request = {
            "prompt": instruction_prompt,
            "temperature": request.temperature,
            "top_p": request.top_p,
            "max_tokens": request.max_tokens,
        }
        return raw_request

    def convert_raw_response_to_completions(self, response: Dict, request: Request) -> List[GeneratedOutput]:
        """Convert each entry of the response's "outputs" list into a GeneratedOutput."""
        # TODO: Support logprobs.
        completions: List[GeneratedOutput] = []
        for output in response["outputs"]:
            stop_reason = output["stop_reason"]
            finish_reason = BedrockMistralClient._COMPLETION_REASON_TO_FINISH_REASON.get(
                stop_reason, stop_reason.lower()
            )
            # Work around generated outputs with leading whitespace due to issue #2467
            # TODO(#2467): Remove workaround
            completions.append(
                truncate_and_tokenize_response_text(
                    output["text"].lstrip(), request, self.tokenizer, self.tokenizer_name, finish_reason
                )
            )
        return completions
284
+
285
+
286
# Amazon Bedrock Client for LLAMA Models
class BedrockLlamaClient(BedrockClient):
    """Bedrock client for Meta Llama models, using Llama's request/response JSON format."""

    _COMPLETION_REASON_TO_FINISH_REASON = {
        "length": "length",
        "stop": "endoftext",
    }

    model_provider = "meta"

    def convert_request_to_raw_request(self, request: Request) -> Dict:
        """Build the Llama-on-Bedrock request body from a HELM Request."""
        # TODO: Support the following:
        # - top_k_per_token
        # - echo_prompt
        # - num_completions
        return {
            "prompt": f"[INST]{request.prompt}[/INST]",
            "temperature": request.temperature,
            "top_p": request.top_p,
            "max_gen_len": request.max_tokens,
        }

    def convert_raw_response_to_completions(self, response: Dict, request: Request) -> List[GeneratedOutput]:
        """Convert the response's single "generation" field into a one-element completion list."""
        # TODO: Support logprobs.
        stop_reason = response["stop_reason"]
        finish_reason = BedrockLlamaClient._COMPLETION_REASON_TO_FINISH_REASON.get(stop_reason, stop_reason.lower())
        # Work around generated outputs with leading whitespace due to issue #2467
        # TODO(#2467): Remove workaround
        completion = truncate_and_tokenize_response_text(
            response["generation"].lstrip(), request, self.tokenizer, self.tokenizer_name, finish_reason
        )
        return [completion]
@@ -8,6 +8,7 @@ from helm.common.optional_dependencies import handle_module_not_found_error
8
8
 
9
9
  try:
10
10
  import boto3
11
+ from boto3 import Session
11
12
  from botocore.config import Config
12
13
  except ModuleNotFoundError as e:
13
14
  handle_module_not_found_error(e, ["aws"])
@@ -70,3 +71,35 @@ def get_bedrock_client(
70
71
 
71
72
  hlog(f"Amazon Bedrock client successfully created with endpoint {bedrock_client._endpoint}")
72
73
  return bedrock_client
74
+
75
+
76
def get_bedrock_client_v1(
    region: Optional[str] = None,
    service_name: str = "bedrock-runtime",
    assumed_role: Optional[str] = None,
    read_timeout: int = 5000,
    connect_timeout: int = 5000,
    max_attempts: int = 10,
):
    """Create a boto3 client for the Bedrock runtime service.

    When `assumed_role` is given, temporary STS credentials for that role are
    used; otherwise the default credential chain applies.
    """
    client_config = Config(
        read_timeout=read_timeout, connect_timeout=connect_timeout, retries={"max_attempts": max_attempts}
    )

    if not assumed_role:
        # default to instance role to get the aws credentials or aws configured credentials
        return boto3.client(service_name=service_name, region_name=region, config=client_config)

    # Assume role and get credentials
    sts_client = boto3.Session(region_name=region).client("sts")
    role_credentials = sts_client.assume_role(RoleArn=str(assumed_role), RoleSessionName="crfm-helm")["Credentials"]
    role_session = Session(
        aws_access_key_id=role_credentials["AccessKeyId"],
        aws_secret_access_key=role_credentials["SecretAccessKey"],
        aws_session_token=role_credentials["SessionToken"],
    )
    return role_session.client(
        service_name=service_name,
        region_name=region,
        config=client_config,
    )
@@ -6,7 +6,7 @@ import torch
6
6
  from helm.common.gpu_utils import get_torch_device
7
7
  from helm.common.images_utils import open_image
8
8
  from helm.common.optional_dependencies import handle_module_not_found_error
9
- from .base_clip_scorer import BaseCLIPScorer
9
+ from helm.clients.clip_scorers.base_clip_scorer import BaseCLIPScorer
10
10
 
11
11
 
12
12
  _ = torch.manual_seed(42)
@@ -4,7 +4,7 @@ import transformers
4
4
  from helm.common.gpu_utils import get_torch_device, get_torch_device_name
5
5
  from helm.common.images_utils import open_image
6
6
  from helm.common.optional_dependencies import handle_module_not_found_error
7
- from .base_clip_scorer import BaseCLIPScorer
7
+ from helm.clients.clip_scorers.base_clip_scorer import BaseCLIPScorer
8
8
 
9
9
  _ = torch.manual_seed(42)
10
10
 
@@ -164,12 +164,12 @@ class CohereRawChatRequest(TypedDict):
164
164
  message: str
165
165
  model: Optional[str]
166
166
  preamble: Optional[str]
167
- chat_history: Optional[Sequence[cohere.ChatMessage]]
167
+ chat_history: Optional[Sequence[cohere.ChatbotMessage]]
168
168
  temperature: Optional[float]
169
169
  max_tokens: Optional[int]
170
170
  k: Optional[int]
171
171
  p: Optional[float]
172
- seed: Optional[float]
172
+ seed: Optional[int]
173
173
  stop_sequences: Optional[Sequence[str]]
174
174
  frequency_penalty: Optional[float]
175
175
  presence_penalty: Optional[float]
@@ -188,7 +188,7 @@ def convert_to_raw_chat_request(request: Request) -> CohereRawChatRequest:
188
188
  "k": request.top_k_per_token,
189
189
  "p": request.top_p,
190
190
  "stop_sequences": request.stop_sequences,
191
- "seed": float(request.random) if request.random is not None else None,
191
+ "seed": int(request.random) if request.random is not None else None,
192
192
  "frequency_penalty": request.frequency_penalty,
193
193
  "presence_penalty": request.presence_penalty,
194
194
  }
@@ -2,7 +2,7 @@ from typing import List, Dict
2
2
 
3
3
  from helm.common.cache import CacheConfig
4
4
  from helm.common.request import Request, RequestResult, GeneratedOutput, Token
5
- from .client import CachingClient, truncate_sequence
5
+ from helm.clients.client import CachingClient, truncate_sequence
6
6
 
7
7
 
8
8
  class GoogleClient(CachingClient):
@@ -11,7 +11,7 @@ from helm.common.request import (
11
11
  Token,
12
12
  EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
13
13
  )
14
- from .client import CachingClient
14
+ from helm.clients.client import CachingClient
15
15
 
16
16
  import requests
17
17
 
@@ -19,7 +19,7 @@ from helm.common.request import (
19
19
  Token,
20
20
  )
21
21
  from helm.tokenizers.tokenizer import Tokenizer
22
- from .client import CachingClient, truncate_sequence
22
+ from helm.clients.client import CachingClient, truncate_sequence
23
23
  from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer, WrappedPreTrainedTokenizer
24
24
  from threading import Lock
25
25
 
@@ -59,17 +59,23 @@ class HuggingFaceServer:
59
59
  self,
60
60
  pretrained_model_name_or_path: str,
61
61
  wrapped_tokenizer: WrappedPreTrainedTokenizer,
62
- openvino: bool = False,
63
62
  **kwargs,
64
63
  ):
65
64
  self.device: Optional[str]
66
65
  if "device_map" in kwargs:
66
+ if "device" in kwargs:
67
+ raise ValueError("At most one of one of `device` and `device_map` may be specified.")
67
68
  try:
68
69
  import accelerate # noqa: F401
69
70
  except ModuleNotFoundError as e:
70
71
  handle_module_not_found_error(e, ["accelerate"])
71
- hlog(f'Hugging Face device_map set to "{kwargs["device_map"]}".')
72
+ hlog(f'Hugging Face device_map set to "{kwargs["device_map"]}" from kwargs.')
72
73
  self.device = None
74
+ elif "device" in kwargs:
75
+ if "device_map" in kwargs:
76
+ raise ValueError("At most one of one of `device` and `device_map` may be specified.")
77
+ hlog(f'Hugging Face device set to "{kwargs["device"]}" from kwargs.')
78
+ self.device = kwargs.pop("device")
73
79
  elif torch.cuda.is_available():
74
80
  hlog('Hugging Face device set to "cuda:0" because CUDA is available.')
75
81
  self.device = "cuda:0"
@@ -85,20 +91,7 @@ class HuggingFaceServer:
85
91
 
86
92
  with htrack_block(f"Loading Hugging Face model {pretrained_model_name_or_path}"):
87
93
  # WARNING this may fail if your GPU does not have enough memory
88
- if openvino:
89
- # Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
90
- # OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
91
- # Intel® architectures using OpenVINO™ runtime.
92
- try:
93
- from optimum.intel.openvino import OVModelForCausalLM
94
- except ModuleNotFoundError as e:
95
- handle_module_not_found_error(e, ["openvino"])
96
-
97
- self.device = "cpu"
98
- self.model = OVModelForCausalLM.from_pretrained(
99
- pretrained_model_name_or_path, export=True, **kwargs
100
- ).to(self.device)
101
- elif self.device is None:
94
+ if self.device is None:
102
95
  # kwargs contains device_map=auto
103
96
  # Do not call to() because accelerate will take care of model device placement.
104
97
  self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs)
@@ -113,7 +106,6 @@ class HuggingFaceServer:
113
106
  encoded_input = tokenizer(raw_request["prompt"], return_tensors="pt", return_token_type_ids=False).to(
114
107
  0 if self.device is None else self.device
115
108
  )
116
-
117
109
  stopping_criteria: Optional[StoppingCriteriaList] = None
118
110
  optional_args = {}
119
111
  if len(raw_request["stop_sequences"]) > 0: