crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff shows the content changes between publicly released versions of this package, as published to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic; see the package's registry page for more details.

Files changed (580):
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
@@ -2,9 +2,9 @@ import shutil
2
2
  import tempfile
3
3
 
4
4
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
5
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
8
 
9
9
 
10
10
  class TestOPTWindowService:
@@ -2,9 +2,9 @@ from tempfile import TemporaryDirectory
2
2
  from typing import List
3
3
 
4
4
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
- from .tokenizer_service import TokenizerService
6
- from .window_service_factory import WindowServiceFactory
7
- from .test_utils import get_tokenizer_service, TEST_PROMPT
5
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
6
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
7
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
8
8
 
9
9
 
10
10
  class TestPalmyraWindowService:
@@ -3,9 +3,9 @@ import tempfile
3
3
  from typing import List
4
4
 
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
8
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
9
9
 
10
10
 
11
11
  class TestT0ppWindowService:
@@ -3,9 +3,9 @@ import tempfile
3
3
  from typing import List
4
4
 
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
8
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
9
9
 
10
10
 
11
11
  class TestT511bWindowService:
@@ -3,9 +3,9 @@ import tempfile
3
3
  from typing import List
4
4
 
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
8
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
9
9
 
10
10
 
11
11
  class TestUL2WindowService:
@@ -4,7 +4,7 @@ from helm.common.authentication import Authentication
4
4
  from helm.common.cache_backend_config import CacheBackendConfig
5
5
  from helm.proxy.services.server_service import ServerService
6
6
  from helm.benchmark.metrics.metric_service import MetricService
7
- from .tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
8
8
 
9
9
 
10
10
  TEST_PROMPT: str = (
@@ -2,9 +2,9 @@ import shutil
2
2
  import tempfile
3
3
 
4
4
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
5
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
8
 
9
9
 
10
10
  class TestYaLMWindowService:
@@ -1,4 +1,4 @@
1
- from .local_window_service import LocalWindowService
1
+ from helm.benchmark.window_services.local_window_service import LocalWindowService
2
2
 
3
3
 
4
4
  class YaLMWindowService(LocalWindowService):
@@ -11,8 +11,8 @@ from helm.common.request import (
11
11
  GeneratedOutput,
12
12
  Token,
13
13
  )
14
- from .client import CachingClient, truncate_sequence, cleanup_str
15
- from .ai21_utils import AI21RequestError, handle_failed_request
14
+ from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
15
+ from helm.clients.ai21_utils import AI21RequestError, handle_failed_request
16
16
 
17
17
  try:
18
18
  from ai21 import AI21Client as AISDKClient
@@ -186,7 +186,7 @@ class AI21ChatClient(CachingClient):
186
186
  completions: List[GeneratedOutput] = []
187
187
 
188
188
  for choice in response["choices"]:
189
- completions.append(GeneratedOutput(text=choice["message"]["content"], logprob=0.0, tokens=[]))
189
+ completions.append(GeneratedOutput(text=choice["message"]["content"] or "", logprob=0.0, tokens=[]))
190
190
 
191
191
  return RequestResult(
192
192
  success=True,
@@ -4,7 +4,7 @@ from helm.common.cache import CacheConfig
4
4
  from helm.common.media_object import TEXT_TYPE
5
5
  from helm.common.optional_dependencies import handle_module_not_found_error
6
6
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
7
- from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
7
+ from helm.clients.client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
8
8
 
9
9
  try:
10
10
  from aleph_alpha_client import Client, CompletionRequest, CompletionResponse, Image, Prompt
File without changes
@@ -0,0 +1,118 @@
1
+ import threading
2
+ from typing import Any, Dict, List, Optional, Tuple, TypedDict
3
+
4
+ import numpy as np
5
+ from transformers import AutoModel, PreTrainedModel
6
+
7
+ from helm.clients.client import CachingClient
8
+ from helm.common.cache import CacheConfig
9
+ from helm.common.media_object import TEXT_TYPE
10
+ from helm.common.request import (
11
+ GeneratedOutput,
12
+ Request,
13
+ RequestResult,
14
+ wrap_request_time,
15
+ )
16
+ from helm.common.audio_utils import get_array_from_audio_file
17
+ from helm.proxy.retry import NonRetriableException
18
+
19
+
20
+ _LOCK = threading.Lock()
21
+ _PRE_TRAINED_MODEL: Optional[PreTrainedModel] = None
22
+
23
+
24
+ def _get_pre_trained_model(model_name: str, **kwargs) -> PreTrainedModel:
25
+ global _LOCK
26
+ global _PRE_TRAINED_MODEL
27
+ with _LOCK:
28
+ if _PRE_TRAINED_MODEL is None:
29
+ _PRE_TRAINED_MODEL = AutoModel.from_pretrained(model_name, **kwargs)
30
+ return _PRE_TRAINED_MODEL
31
+
32
+
33
+ class DivaLlamaRequest(TypedDict):
34
+ """Cache key for DivaLlamaClient"""
35
+
36
+ model: str
37
+ media_objects: List[Dict[str, Any]]
38
+
39
+
40
+ class DivaLlamaClient(CachingClient):
41
+ SAMPLE_RATE = 16000
42
+
43
+ def __init__(
44
+ self,
45
+ cache_config: CacheConfig,
46
+ **kwargs,
47
+ ):
48
+ super().__init__(cache_config)
49
+ self.pre_trained_model = _get_pre_trained_model("WillHeld/DiVA-llama-3-v0-8b", trust_remote_code=True, **kwargs)
50
+
51
+ @staticmethod
52
+ def _get_generate_input(request: Request) -> Tuple[np.ndarray, Optional[str]]:
53
+ if request.prompt:
54
+ raise NonRetriableException("request.prompt must be empty for DivaLlamaClient")
55
+ if request.embedding:
56
+ raise NonRetriableException("request.embedding must be empty for DivaLlamaClient")
57
+ if request.messages:
58
+ raise NonRetriableException("request.messages must be empty for DivaLlamaClient")
59
+ if request.multimodal_prompt is None:
60
+ raise NonRetriableException("request.multimodal_prompt must not be None for DivaLlamaClient")
61
+ text_input: Optional[str] = None
62
+ audio_input: Optional[np.ndarray] = None
63
+ for media_object in request.multimodal_prompt.media_objects:
64
+ if media_object.is_type("audio"):
65
+ if audio_input is not None:
66
+ raise NonRetriableException(
67
+ "Only one audio object allowed in request.multimodal_prompt.media_objects"
68
+ )
69
+ assert media_object.location
70
+ audio_input = get_array_from_audio_file(media_object.location, DivaLlamaClient.SAMPLE_RATE)
71
+ elif media_object.is_type(TEXT_TYPE):
72
+ if text_input is not None:
73
+ raise NonRetriableException(
74
+ "Only one text object allowed in request.multimodal_prompt.media_objects"
75
+ )
76
+ assert media_object.text is not None
77
+ text_input = media_object.text
78
+ else:
79
+ raise NonRetriableException(f"Unsupported media content type type: {media_object.content_type}")
80
+ if audio_input is None:
81
+ raise NonRetriableException(
82
+ "Expected a single audio object allowed in request.multimodal_prompt.media_objects"
83
+ )
84
+ return audio_input, text_input
85
+
86
+ def make_request(self, request: Request) -> RequestResult:
87
+ assert request.multimodal_prompt is not None
88
+ raw_request: DivaLlamaRequest = {
89
+ "model": request.model,
90
+ "media_objects": [media_object.to_dict() for media_object in request.multimodal_prompt.media_objects],
91
+ }
92
+
93
+ try:
94
+
95
+ def do_it() -> Dict[str, Any]:
96
+ with _LOCK:
97
+ audio_input, text_input = DivaLlamaClient._get_generate_input(request)
98
+ if text_input is None:
99
+ return {"completions": self.pre_trained_model.generate([audio_input])}
100
+ else:
101
+ return {"completions": self.pre_trained_model.generate([audio_input], [text_input])}
102
+
103
+ cache_key = CachingClient.make_cache_key(raw_request, request)
104
+ response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
105
+ except Exception as e: # Do something if error is encountered.
106
+ error: str = f"HuggingFace error: {e}"
107
+ return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
108
+
109
+ generated_output = GeneratedOutput(text=response["completions"][0], logprob=0, tokens=[])
110
+
111
+ return RequestResult(
112
+ success=True,
113
+ cached=cached,
114
+ request_time=response["request_time"],
115
+ request_datetime=response.get("request_datetime"),
116
+ completions=[generated_output],
117
+ embedding=[],
118
+ )
@@ -0,0 +1,198 @@
1
+ from threading import Lock
2
+ import torch
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from dataclasses import dataclass
6
+ from transformers import AutoTokenizer
7
+ import whisper
8
+ from helm.clients.audio_language.llama_omni.model.builder import load_pretrained_model as load_llama_omni
9
+ from helm.clients.audio_language.llama_omni.model.language_model.omni_speech2s_llama import OmniSpeech2SLlamaForCausalLM
10
+ from helm.clients.audio_language.llama_omni.conversation import conv_templates, Conversation
11
+ from helm.clients.audio_language.llama_omni.preprocess import tokenizer_speech_token
12
+
13
+ from helm.common.cache import CacheConfig
14
+ from helm.common.gpu_utils import get_torch_device_name
15
+ from helm.common.hierarchical_logger import hlog, htrack_block
16
+ from helm.common.media_object import TEXT_TYPE
17
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
18
+ from helm.common.request import wrap_request_time
19
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
20
+
21
+
22
+ @dataclass(frozen=True)
23
+ class LoadedLlamaOmniModelProcessor:
24
+ """Loaded model and processor for Qwen."""
25
+
26
+ model: OmniSpeech2SLlamaForCausalLM
27
+ tokenizer: AutoTokenizer
28
+
29
+
30
+ _models_lock: Lock = Lock()
31
+ _models: Dict[str, Optional[LoadedLlamaOmniModelProcessor]] = {
32
+ "ICTNLP/Llama-3.1-8B-Omni": None,
33
+ }
34
+
35
+
36
+ class LlamaOmniAudioLMClient(CachingClient):
37
+ """
38
+ From https://github.com/ictnlp/LLaMA-Omni,
39
+ LLaMA-Omni is the audio multimodal version based on the LLaMA-3.1-8B large language model,
40
+ developed by ICTNLP group. LLaMA-Omni accepts audio, text as inputs, and outputs text.
41
+
42
+ Paper: https://arxiv.org/abs/2409.06666
43
+ """
44
+
45
+ END_OF_TEXT_TOKEN: str = "<|im_end|>"
46
+ CONV_MODE: str = "llama_3"
47
+ PAD_ID: int = 128004
48
+ MEL_NUM: int = 128
49
+
50
+ def __init__(self, cache_config: CacheConfig):
51
+ super().__init__(cache_config=cache_config)
52
+ self._device: str = get_torch_device_name()
53
+
54
+ def _get_model(self, helm_model_name: str) -> LoadedLlamaOmniModelProcessor:
55
+ global _models_lock
56
+ global _models
57
+
58
+ model_name: str
59
+ if helm_model_name == "llama-3.1-8b-omni":
60
+ model_name = "ICTNLP/Llama-3.1-8B-Omni"
61
+ else:
62
+ raise ValueError(f"Unhandled model name: {helm_model_name}")
63
+
64
+ # Ensure that only one thread is loading the model at a time
65
+ with _models_lock:
66
+ loaded_model_processor = _models[model_name]
67
+ if loaded_model_processor is None:
68
+ hlog(f"Loading model {model_name} and caching in memory...")
69
+ # Follow the official LLaMA-Omni model loading pattern:
70
+ # https://github.com/ictnlp/LLaMA-Omni/blob/main/omni_speech/infer/run.sh
71
+ tokenizer, model, _ = load_llama_omni(model_name, None, s2s=True)
72
+ _models[model_name] = LoadedLlamaOmniModelProcessor(model, tokenizer)
73
+ loaded_model_processor = _models[model_name]
74
+
75
+ assert loaded_model_processor is not None
76
+ return loaded_model_processor
77
+
78
+ def _load_local_audio(self, media_object) -> torch.Tensor:
79
+ assert media_object.is_local_file, "LLaMA-Omni only supports local audio file input"
80
+ audio_media = whisper.load_audio(media_object.location)
81
+ audio_media = whisper.pad_or_trim(audio_media)
82
+ audio_media = whisper.log_mel_spectrogram(audio_media, n_mels=self.MEL_NUM).permute(1, 0)
83
+ return audio_media
84
+
85
+ def make_request(self, request: Request) -> RequestResult:
86
+ assert request.multimodal_prompt is not None, "Multimodal prompt is required"
87
+
88
+ loaded_model_processor: LoadedLlamaOmniModelProcessor = self._get_model(request.model_engine)
89
+ model = loaded_model_processor.model
90
+ tokenizer = loaded_model_processor.tokenizer
91
+
92
+ # The generation configs are taken from the official LLaMA-Omni repository
93
+ # https://github.com/ictnlp/LLaMA-Omni/blob/main/omni_speech/infer/infer.py#L116
94
+ generation_args = {
95
+ "max_new_tokens": 25,
96
+ "do_sample": False,
97
+ "use_cache": False,
98
+ "pad_token_id": self.PAD_ID,
99
+ "streaming_unit_gen": False,
100
+ "top_p": None,
101
+ }
102
+
103
+ input_text_query: Dict[str, str]
104
+ input_audio_query: Dict[str, Any]
105
+ prompt_text: str = ""
106
+
107
+ for media_object in request.multimodal_prompt.media_objects:
108
+ if media_object.is_type("audio") and media_object.location:
109
+ input_audio_query = {"audio": self._load_local_audio(media_object)}
110
+ elif media_object.is_type(TEXT_TYPE):
111
+ if media_object.text is None:
112
+ raise ValueError("MediaObject of text type has missing text field value")
113
+ input_text_query = {"text": "<speech>\n" + media_object.text}
114
+ prompt_text += media_object.text
115
+ else:
116
+ raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
117
+
118
+ completions: List[GeneratedOutput] = []
119
+ request_time: float = 0
120
+ request_datetime: Optional[int] = None
121
+ all_cached: bool = True
122
+
123
+ with htrack_block(f"Generating for prompt: {prompt_text}"):
124
+ for completion_index in range(request.num_completions):
125
+ try:
126
+
127
+ def do_it() -> Dict[str, Any]:
128
+ conv: Conversation = conv_templates[self.CONV_MODE].copy()
129
+ conv.append_message(conv.roles[0], input_text_query["text"])
130
+ conv.append_message(conv.roles[1], None)
131
+ query: str = conv.get_prompt()
132
+ # LLama-Omni requires a batch input
133
+ text_inputs = (
134
+ tokenizer_speech_token(query, tokenizer, return_tensors="pt").unsqueeze(0).to(self._device)
135
+ )
136
+ audio_inputs = (
137
+ input_audio_query["audio"].to(dtype=torch.float16, device=self._device).unsqueeze(0)
138
+ )
139
+ speech_length = torch.LongTensor([audio_inputs.shape[1]])
140
+ pred, _ = model.generate(
141
+ text_inputs,
142
+ audio_inputs,
143
+ speech_length,
144
+ None,
145
+ None,
146
+ None,
147
+ None,
148
+ None,
149
+ None,
150
+ None,
151
+ None,
152
+ False,
153
+ None,
154
+ None,
155
+ **generation_args,
156
+ )
157
+ completion = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
158
+ tokens: List[str] = tokenizer.tokenize(completion)
159
+ return {"output": (completion, tokens)}
160
+
161
+ # Include the prompt and model name in the cache key
162
+ cache_key = CachingClient.make_cache_key(
163
+ raw_request={
164
+ "completion_index": completion_index,
165
+ "model": request.model,
166
+ "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
167
+ **generation_args,
168
+ },
169
+ request=request,
170
+ )
171
+ result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
172
+ except RuntimeError as model_error:
173
+ return RequestResult(
174
+ success=False, cached=False, error=str(model_error), completions=[], embedding=[]
175
+ )
176
+
177
+ text, tokens = result["output"]
178
+
179
+ # Tokenize truncated text to get the list of tokens
180
+ completions.append(
181
+ GeneratedOutput(
182
+ text=text, logprob=0, tokens=[Token(text=str(token), logprob=0) for token in tokens]
183
+ )
184
+ )
185
+
186
+ request_time += result["request_time"]
187
+ # Use the datetime from the first completion because that's when the request was fired
188
+ request_datetime = request_datetime or result.get("request_datetime")
189
+ all_cached = all_cached and cached
190
+
191
+ return RequestResult(
192
+ success=True,
193
+ cached=all_cached,
194
+ request_time=request_time,
195
+ request_datetime=request_datetime,
196
+ completions=completions,
197
+ embedding=[],
198
+ )
@@ -0,0 +1,188 @@
1
+ from threading import Lock
2
+ import librosa
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from dataclasses import dataclass
6
+ from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
7
+
8
+ from helm.common.cache import CacheConfig
9
+ from helm.common.gpu_utils import get_torch_device_name
10
+ from helm.common.hierarchical_logger import hlog, htrack_block
11
+ from helm.common.media_object import TEXT_TYPE
12
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
13
+ from helm.common.request import wrap_request_time
14
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
15
+
16
+
17
+ @dataclass(frozen=True)
18
+ class LoadedQwenModelProcessor:
19
+ """Loaded model and processor for Qwen."""
20
+
21
+ model: Qwen2AudioForConditionalGeneration
22
+ tokenizer: AutoProcessor
23
+
24
+
25
+ _models_lock: Lock = Lock()
26
+ _models: Dict[str, Optional[LoadedQwenModelProcessor]] = {
27
+ "Qwen/Qwen2-Audio-7B-Instruct": None,
28
+ }
29
+
30
+
31
+ class Qwen2AudioLMClient(CachingClient):
32
+ """
33
+ From https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct,
34
+ Qwen2-Audio-Instruct (Qwen2 Large Audio Language Model) is the audio multimodal version of the large model series,
35
+ Qwen2 (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen2-Audio-Instruct accepts audio, text as inputs,
36
+ outputs text.
37
+ Alibaba released Qwen-Audio and Qwen-Audio-Instruct, which is an instruction-following model based on Qwen-Audio.
38
+ For now, we have integrated Qwen2-Audio-Instruct for instruction-following tasks.
39
+
40
+ Paper: https://arxiv.org/abs/2407.10759
41
+ """
42
+
43
+ END_OF_TEXT_TOKEN: str = "<|im_end|>"
44
+
45
+ def __init__(self, cache_config: CacheConfig):
46
+ super().__init__(cache_config=cache_config)
47
+ self._device: str = get_torch_device_name()
48
+
49
+ def _get_model(self, helm_model_name: str) -> LoadedQwenModelProcessor:
50
+ global _models_lock
51
+ global _models
52
+
53
+ model_name: str
54
+ if helm_model_name == "qwen2-audio-7b-instruct":
55
+ model_name = "Qwen/Qwen2-Audio-7B-Instruct"
56
+ else:
57
+ raise ValueError(f"Unhandled model name: {helm_model_name}")
58
+
59
+ # Ensure that only one thread is loading the model at a time
60
+ with _models_lock:
61
+ loaded_model_processor = _models[model_name]
62
+ if loaded_model_processor is None:
63
+ hlog(f"Loading model {model_name} and caching in memory...")
64
+ model = Qwen2AudioForConditionalGeneration.from_pretrained(
65
+ model_name,
66
+ device_map=self._device,
67
+ ).eval()
68
+ tokenizer = AutoProcessor.from_pretrained(
69
+ model_name,
70
+ )
71
+ _models[model_name] = LoadedQwenModelProcessor(model, tokenizer)
72
+ loaded_model_processor = _models[model_name]
73
+
74
+ assert loaded_model_processor is not None
75
+ return loaded_model_processor
76
+
77
+ def make_request(self, request: Request) -> RequestResult:
78
+ assert request.multimodal_prompt is not None, "Multimodal prompt is required"
79
+
80
+ loaded_model_processor: LoadedQwenModelProcessor = self._get_model(request.model_engine)
81
+ model = loaded_model_processor.model
82
+ tokenizer = loaded_model_processor.tokenizer
83
+
84
+ input_query: List[Dict[str, Any]] = []
85
+ query: List[Dict[str, str]] = []
86
+ prompt_text: str = ""
87
+
88
+ input_query.append({"role": "system", "content": "You are a helpful assistant."})
89
+ prompt_text += "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
90
+ for media_num, media_object in enumerate(request.multimodal_prompt.media_objects):
91
+ if media_object.is_type("audio") and media_object.location:
92
+ assert media_object.is_local_file, "Only local audio files are supported"
93
+ query.append({"type": "audio", "audio_url": media_object.location})
94
+
95
+ prompt_text += f"<|im_start|>user\nAudio {media_num+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
96
+ elif media_object.is_type(TEXT_TYPE):
97
+ if media_object.text is None:
98
+ raise ValueError("MediaObject of text type has missing text field value")
99
+ query.append({"type": "text", "text": media_object.text})
100
+ prompt_text += media_object.text
101
+ else:
102
+ raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
103
+ prompt_text += "<|im_end|>\n<|im_start|>assistant\n"
104
+
105
+ input_query.append({"role": "user", "content": query})
106
+ completions: List[GeneratedOutput] = []
107
+ request_time: float = 0
108
+ request_datetime: Optional[int] = None
109
+ all_cached: bool = True
110
+
111
+ with htrack_block(f"Generating for prompt: {prompt_text}"):
112
+ for completion_index in range(request.num_completions):
113
+ try:
114
+
115
+ def do_it() -> Dict[str, Any]:
116
+ inputs = tokenizer.apply_chat_template(input_query, add_generation_prompt=True, tokenize=False)
117
+ audios: List[Any] = []
118
+ # Refer to the official Qwen2-Audio documentation for the format of the input query
119
+ # https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct
120
+ for message in input_query:
121
+ if isinstance(message["content"], list):
122
+ for element in message["content"]:
123
+ if element["type"] == "audio":
124
+ audios.append(
125
+ librosa.load(
126
+ element["audio_url"],
127
+ sr=tokenizer.feature_extractor.sampling_rate,
128
+ )[0]
129
+ )
130
+ inputs = tokenizer(
131
+ text=inputs,
132
+ audios=audios,
133
+ sampling_rate=tokenizer.feature_extractor.sampling_rate,
134
+ return_tensors="pt",
135
+ padding=True,
136
+ )
137
+ input_length = inputs.input_ids.size(1)
138
+ # Qwen2-Audio-Instruct counts input into the max_length,
139
+ # so we need to add the length of the prompt
140
+ inputs = inputs.to(self._device)
141
+ pred = model.generate(**inputs, max_length=request.max_tokens + input_length)[:, input_length:]
142
+
143
+ completion = tokenizer.decode(
144
+ pred.cpu()[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
145
+ )
146
+ # The processor of Qwen2-Audio-Instruct consists of an AutoTokenizer and a WhisperFeatureExtractor
147
+ tokens: List[str] = tokenizer.tokenizer.tokenize(completion)
148
+ return {"output": (completion, tokens)}
149
+
150
+ # Include the prompt and model name in the cache key
151
+ cache_key = CachingClient.make_cache_key(
152
+ raw_request={
153
+ "completion_index": completion_index,
154
+ "model": request.model,
155
+ "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
156
+ "max_tokens": request.max_tokens,
157
+ },
158
+ request=request,
159
+ )
160
+ result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
161
+ except RuntimeError as model_error:
162
+ return RequestResult(
163
+ success=False, cached=False, error=str(model_error), completions=[], embedding=[]
164
+ )
165
+
166
+ text, tokens = result["output"]
167
+ hlog(f"Generated: {text}")
168
+
169
+ # Tokenize truncated text to get the list of tokens
170
+ completions.append(
171
+ GeneratedOutput(
172
+ text=text, logprob=0, tokens=[Token(text=str(token), logprob=0) for token in tokens]
173
+ )
174
+ )
175
+
176
+ request_time += result["request_time"]
177
+ # Use the datetime from the first completion because that's when the request was fired
178
+ request_datetime = request_datetime or result.get("request_datetime")
179
+ all_cached = all_cached and cached
180
+
181
+ return RequestResult(
182
+ success=True,
183
+ cached=all_cached,
184
+ request_time=request_time,
185
+ request_datetime=request_datetime,
186
+ completions=completions,
187
+ embedding=[],
188
+ )