crfm-helm 0.5.3 (py3-none-any.whl) → 0.5.5 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -0,0 +1,267 @@
1
+ from abc import ABC
2
+ from abc import abstractmethod
3
+
4
+ from helm.common.hierarchical_logger import hlog
5
+ from helm.common.cache import CacheConfig
6
+ from helm.common.request import (
7
+ Request,
8
+ RequestResult,
9
+ Token,
10
+ wrap_request_time,
11
+ EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
12
+ GeneratedOutput,
13
+ )
14
+
15
+ from helm.clients.client import CachingClient
16
+ from helm.common.optional_dependencies import handle_module_not_found_error
17
+ from typing import TypeVar, Generic
18
+ from typing import Any, Dict, List
19
+ from threading import Semaphore, Lock
20
+ import threading
21
+
22
+ try:
23
+ from ibm_watsonx_ai import Credentials
24
+ from ibm_watsonx_ai.foundation_models import ModelInference
25
+ from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
26
+ from ibm_watsonx_ai.foundation_models.schema import (
27
+ TextChatParameters,
28
+ TextGenParameters,
29
+ ReturnOptionProperties,
30
+ )
31
+
32
+ except ModuleNotFoundError as e:
33
+ handle_module_not_found_error(e, ["ibm"])
34
+
35
# Maximum number of requests allowed to run in parallel for one model;
# the limit is imposed by the IBM API.
MAX_CONCURRENT_REQUESTS = 8
# Lazily populated map from model name to the semaphore throttling that model.
__semaphores: Dict[str, Semaphore] = {}
# Serializes lookups and insertions on the semaphore map.
__semaphores_lock = Lock()


def _get_semaphore(model: str) -> Semaphore:
    """Return the semaphore that throttles requests for ``model``.

    The first call for a given model creates a semaphore with
    ``MAX_CONCURRENT_REQUESTS`` permits; later calls return that same object.
    """
    with __semaphores_lock:
        semaphore = __semaphores.get(model)
        if semaphore is None:
            semaphore = threading.Semaphore(MAX_CONCURRENT_REQUESTS)
            __semaphores[model] = semaphore
    return semaphore
47
+
48
+
49
+ T = TypeVar("T", TextGenParameters, TextChatParameters)
50
+
51
+
52
class ModelInferenceHandler(ABC, Generic[T]):
    """Interface for one way of calling a ``ModelInference`` engine.

    ``T`` is the parameter type the handler works with (``TextGenParameters``
    or ``TextChatParameters``). Implementations build the parameters for a
    request, send the prompt to the engine, and turn the raw response into
    ``GeneratedOutput`` objects.
    """

    @abstractmethod
    def __init__(self, inference_engine: ModelInference):
        """
        Keep a reference to the engine used to serve requests.

        :type inference_engine: object
        """
        self.inference_engine = inference_engine

    @abstractmethod
    def serve_request(self, prompt: str, params: T) -> Dict:
        """Send ``prompt`` with ``params`` to the engine and return the raw response."""

    @abstractmethod
    def parse_response(self, response: dict) -> List[GeneratedOutput]:
        """Convert a raw engine response into a list of completions."""

    @abstractmethod
    def create_params(self, request: Request) -> T:
        """Build the engine parameter object for ``request``."""
71
+
72
+
73
class GenerateInferenceHandler(ModelInferenceHandler[TextGenParameters]):
    """Handler that serves requests through the engine's ``generate`` call."""

    def __init__(self, inference_engine: ModelInference):
        self.inference_engine = inference_engine

    def create_params(self, request: Request) -> TextGenParameters:
        """Build ``TextGenParameters`` from ``request``.

        Only ``temperature``, ``top_p`` and ``max_tokens`` are taken from the
        request; the return options ask for the input text, the generated
        tokens and their log probabilities.
        """
        if self.inference_engine.model_id == "ibm/granite-13b-instruct-v2":
            # Default temperature 0.05 required by ibm/granite-13b-instruct-v2
            temperature = 0.05
        elif request.temperature == 0:
            # An exact zero is replaced by a tiny positive value.
            temperature = 1e-7
        else:
            temperature = request.temperature

        return TextGenParameters(
            temperature=temperature,
            top_p=request.top_p,
            max_new_tokens=request.max_tokens,
            return_options=ReturnOptionProperties(
                input_text=True,
                generated_tokens=True,
                input_tokens=False,
                token_logprobs=True,
                token_ranks=False,
            ),
            include_stop_sequence=False,
            prompt_variables=None,
        )

    def serve_request(self, prompt: str, params: TextGenParameters) -> Dict:
        """Call ``generate`` on the engine while holding the per-model semaphore."""
        semaphore = _get_semaphore(self.inference_engine.model_id)

        with semaphore:
            response = self.inference_engine.generate(
                prompt=prompt,
                params=params,
            )
        return response

    def parse_response(self, response: dict) -> List[GeneratedOutput]:
        """Turn the ``results`` of a generate response into completions.

        Each completion's log probability is the sum of its token log
        probabilities; a token without a ``logprob`` entry counts as 0.
        A result whose ``generated_tokens`` is missing or ``None`` still yields
        a completion (with no tokens) instead of losing the generated text.

        Parsing is best-effort: any exception is logged and the completions
        parsed so far are returned.
        """
        completions: List[GeneratedOutput] = []
        try:
            for result in response["results"]:
                sequence_logprob: float = 0
                tokens: List[Token] = []
                generated_text = result["generated_text"]
                # Fall back to an empty token list so the text is kept even
                # when the response carries no per-token details.
                for token_info in result.get("generated_tokens") or []:
                    logprob = token_info.get("logprob", 0)
                    tokens.append(Token(text=token_info["text"], logprob=logprob))
                    sequence_logprob += logprob

                completions.append(GeneratedOutput(text=generated_text, logprob=sequence_logprob, tokens=tokens))
        except Exception as e:
            hlog(f"GenerateInferenceHandler failed with exception {e} during parse_response {response}")
        return completions
128
+
129
+
130
class ChatModelInferenceHandler(ModelInferenceHandler[TextChatParameters]):
    """Handler that serves requests through the engine's ``chat`` call."""

    def __init__(self, inference_engine: ModelInference):
        self.inference_engine = inference_engine

    def create_params(self, request: Request) -> TextChatParameters:
        """Build ``TextChatParameters`` from ``request``.

        ``temperature``, ``max_tokens`` and ``top_p`` come from the request;
        log probabilities are requested and both penalties are fixed at 0.
        """
        return TextChatParameters(
            logprobs=True,
            presence_penalty=0,
            frequency_penalty=0,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            top_p=request.top_p,
        )

    def parse_response(self, response: dict) -> List[GeneratedOutput]:
        """Turn the ``choices`` of a chat response into completions.

        Each completion's log probability is the sum of its token log
        probabilities. A choice whose ``logprobs`` (or its ``content``) is
        missing or ``None`` still yields a completion with no tokens and a log
        probability of 0, rather than losing the message text.

        Parsing is best-effort: any exception is logged and the completions
        parsed so far are returned.
        """
        completions: List[GeneratedOutput] = []
        try:
            for choice in response["choices"]:
                sequence_logprob: float = 0
                tokens: List[Token] = []
                generated_text = choice["message"]["content"]

                # Tolerate responses that carry no log-probability details.
                logprob_entries = (choice.get("logprobs") or {}).get("content") or []
                for entry in logprob_entries:
                    logprob = entry["logprob"]
                    tokens.append(Token(text=entry["token"], logprob=logprob))
                    sequence_logprob += logprob

                completions.append(GeneratedOutput(text=generated_text, logprob=sequence_logprob, tokens=tokens))
        except Exception as e:
            hlog(f"ChatModelInferenceHandler failed with exception {e} during parse_response {response}")
        return completions

    def serve_request(self, prompt: str, params: TextChatParameters) -> Dict:
        """Send ``prompt`` as a single user message while holding the per-model semaphore."""
        semaphore = _get_semaphore(self.inference_engine.model_id)

        with semaphore:
            response = self.inference_engine.chat(
                messages=[{"role": "user", "content": prompt}],
                params=params,
            )
        return response
173
+
174
+
175
class IbmClient(CachingClient, ABC):
    """Base client for IBM models served through a ``ModelInference`` engine.

    Subclasses implement ``make_request`` and choose which inference handler
    is passed to ``do_call``.
    """

    def __init__(
        self,
        cache_config: CacheConfig,
        api_key: str,
        region: str,
        location: List[Dict[str, Any]],
        watsonx_model_name: str,
        **kwargs,
    ):
        """Resolve the region configuration and create the inference engine.

        :param cache_config: cache configuration forwarded to ``CachingClient``.
        :param api_key: API key used to build the engine credentials.
        :param region: region name, matched case-insensitively against ``location``.
        :param location: list of per-region objects with the keys ``region``,
            ``url`` and ``project_id``.
        :param watsonx_model_name: model id passed to ``ModelInference``.
        :param kwargs: extra arguments, stored on the instance unchanged.
        :raises ValueError: if no entry for ``region`` provides a project id
            or a url.
        """
        super().__init__(cache_config=cache_config)
        self.project_id = None
        self.url = None
        self.watsonx_model_name = watsonx_model_name
        self.api_key = api_key
        self.region = region
        self.kwargs = kwargs

        wanted_region = self.region.lower()
        # No early exit: if several entries match, the last one wins.
        for entry in location:
            if entry["region"].lower() == wanted_region:
                self.project_id = entry["project_id"]
                self.url = entry["url"]

        # Explicit raises rather than ``assert`` so the checks also run
        # when Python is started with -O.
        if self.project_id is None:
            raise ValueError(
                "Missed project_id for specified region configuration in credentials.conf, should be in list "
                "of JSON objects with 'region', 'url', 'project_id' per region"
            )
        if self.url is None:
            raise ValueError(
                "Missed url for specified region configuration in credentials.conf, should be in list "
                "of JSON objects with 'region', 'url', 'project_id' per region"
            )

        self.inference_engine = ModelInference(
            model_id=self.watsonx_model_name,
            params={GenParams.MAX_NEW_TOKENS: 2000},
            credentials=Credentials(api_key=api_key, url=self.url),
            project_id=self.project_id,
        )

        hlog("Started IBM Client")

    @abstractmethod
    def make_request(self, request: Request) -> RequestResult:
        """Serve ``request`` and return its result."""

    def do_call(self, inference_handler: ModelInferenceHandler, request: Request) -> RequestResult:
        """Serve ``request`` through ``inference_handler``, going through the cache.

        The cache key is built from the prompt, the handler's parameters and
        the request's model. Exceptions raised by the handler or the cache
        are not caught here.
        """
        params = inference_handler.create_params(request=request)

        def do_it() -> Dict[str, Any]:
            return inference_handler.serve_request(prompt=request.prompt, params=params)

        raw_request = {"prompt": request.prompt, "params": params.to_dict(), "model": request.model}

        cache_key = CachingClient.make_cache_key(raw_request, request)
        response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        completions = inference_handler.parse_response(response)
        return RequestResult(
            success=True,
            cached=cached,
            request_time=response["request_time"],
            request_datetime=response.get("request_datetime"),
            completions=completions,
            embedding=[],
        )
238
+
239
+
240
class IbmChatClient(IbmClient):
    """IBM client that serves requests with ``ChatModelInferenceHandler``."""

    def make_request(self, request: Request) -> RequestResult:
        """Serve ``request`` through the chat handler.

        Embedding requests are answered with the shared "embedding
        unavailable" result. Any exception raised while serving is converted
        into an unsuccessful ``RequestResult`` carrying the error text.
        """
        # Embedding not supported for this model
        if request.embedding:
            return EMBEDDING_UNAVAILABLE_REQUEST_RESULT

        try:
            handler = ChatModelInferenceHandler(inference_engine=self.inference_engine)
            return self.do_call(inference_handler=handler, request=request)
        except Exception as e:
            return RequestResult(
                success=False,
                cached=False,
                error=f"IBM Chat client Model error: {e}",
                completions=[],
                embedding=[],
            )
254
+
255
+
256
class IbmTextClient(IbmClient):
    """IBM client that serves requests with ``GenerateInferenceHandler``."""

    def make_request(self, request: Request) -> RequestResult:
        """Serve ``request`` through the text-generation handler.

        Embedding requests are answered with the shared "embedding
        unavailable" result. Any exception raised while serving is converted
        into an unsuccessful ``RequestResult`` carrying the error text.
        """
        # Embedding not supported for this model
        if request.embedding:
            return EMBEDDING_UNAVAILABLE_REQUEST_RESULT

        try:
            handler = GenerateInferenceHandler(inference_engine=self.inference_engine)
            return self.do_call(inference_handler=handler, request=request)
        except Exception as e:
            return RequestResult(
                success=False,
                cached=False,
                error=f"IBM Text client Model error: {e}",
                completions=[],
                embedding=[],
            )
@@ -9,7 +9,7 @@ from helm.common.tokenization_request import (
9
9
  DecodeRequestResult,
10
10
  )
11
11
  from helm.clients.client import Client, CachingClient
12
- from .image_generation_client_utils import get_single_image_multimedia_object
12
+ from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
13
13
 
14
14
 
15
15
  class AdobeVisionClient(Client):
@@ -9,7 +9,7 @@ from helm.common.tokenization_request import (
9
9
  DecodeRequestResult,
10
10
  )
11
11
  from helm.clients.client import Client, CachingClient
12
- from .image_generation_client_utils import get_single_image_multimedia_object
12
+ from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
13
13
 
14
14
 
15
15
  class AlephAlphaImageGenerationClient(Client):
@@ -6,9 +6,9 @@
6
6
  @Contact : dm18@mails.tsinghua.edu.cn
7
7
  """
8
8
 
9
- from .direct_sr import DirectSuperResolution
10
- from .iterative_sr import IterativeSuperResolution
11
- from .sr_group import SRGroup
9
+ from helm.clients.image_generation.cogview2.sr_pipeline.direct_sr import DirectSuperResolution
10
+ from helm.clients.image_generation.cogview2.sr_pipeline.iterative_sr import IterativeSuperResolution
11
+ from helm.clients.image_generation.cogview2.sr_pipeline.sr_group import SRGroup
12
12
 
13
13
  DirectSuperResolution
14
14
  IterativeSuperResolution
@@ -10,8 +10,11 @@
10
10
  import torch
11
11
  from icetk import icetk as tokenizer
12
12
 
13
- from .dsr_sampling import filling_sequence_dsr, IterativeEntfilterStrategy
14
- from .dsr_model import DsrModel
13
+ from helm.clients.image_generation.cogview2.sr_pipeline.dsr_sampling import (
14
+ filling_sequence_dsr,
15
+ IterativeEntfilterStrategy,
16
+ )
17
+ from helm.clients.image_generation.cogview2.sr_pipeline.dsr_model import DsrModel
15
18
  from helm.common.optional_dependencies import handle_module_not_found_error
16
19
 
17
20
 
@@ -8,8 +8,11 @@
8
8
  import torch
9
9
  from icetk import icetk as tokenizer
10
10
 
11
- from .itersr_sampling import filling_sequence_itersr, IterativeEntfilterStrategy
12
- from .itersr_model import ItersrModel
11
+ from helm.clients.image_generation.cogview2.sr_pipeline.itersr_sampling import (
12
+ filling_sequence_itersr,
13
+ IterativeEntfilterStrategy,
14
+ )
15
+ from helm.clients.image_generation.cogview2.sr_pipeline.itersr_model import ItersrModel
13
16
  from helm.common.optional_dependencies import handle_module_not_found_error
14
17
 
15
18
 
@@ -7,8 +7,8 @@
7
7
  """
8
8
 
9
9
  # here put the import lib
10
- from .direct_sr import DirectSuperResolution
11
- from .iterative_sr import IterativeSuperResolution
10
+ from helm.clients.image_generation.cogview2.sr_pipeline.direct_sr import DirectSuperResolution
11
+ from helm.clients.image_generation.cogview2.sr_pipeline.iterative_sr import IterativeSuperResolution
12
12
 
13
13
  from helm.common.optional_dependencies import handle_module_not_found_error
14
14
 
@@ -20,7 +20,7 @@ from helm.common.tokenization_request import (
20
20
  )
21
21
  from helm.clients.client import Client, CachingClient
22
22
  from helm.clients.image_generation.cogview2.coglm_strategy import CoglmStrategy
23
- from .image_generation_client_utils import get_single_image_multimedia_object
23
+ from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
24
24
 
25
25
 
26
26
  class CogView2Client(Client):
@@ -15,7 +15,7 @@ from helm.common.tokenization_request import (
15
15
  )
16
16
  from helm.clients.moderation_api_client import ModerationAPIClient
17
17
  from helm.clients.client import Client, CachingClient
18
- from .image_generation_client_utils import get_single_image_multimedia_object
18
+ from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
19
19
 
20
20
  try:
21
21
  import openai
@@ -7,8 +7,8 @@ from helm.common.optional_dependencies import handle_module_not_found_error
7
7
  from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
8
8
  from helm.clients.moderation_api_client import ModerationAPIClient
9
9
  from helm.clients.client import CachingClient
10
- from .dalle2_client import DALLE2Client
11
- from .image_generation_client_utils import get_single_image_multimedia_object
10
+ from helm.clients.image_generation.dalle2_client import DALLE2Client
11
+ from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
12
12
 
13
13
  try:
14
14
  import openai
@@ -1,3 +1,3 @@
1
1
  __version__ = "0.1.4"
2
2
 
3
- from .model import DalleBart, DalleBartProcessor
3
+ from helm.clients.image_generation.dalle_mini.model import DalleBart, DalleBartProcessor
@@ -6,7 +6,7 @@ from pathlib import Path
6
6
  import numpy as np
7
7
  from datasets import Dataset, load_dataset
8
8
 
9
- from .model.text import TextNormalizer
9
+ from helm.clients.image_generation.dalle_mini.model.text import TextNormalizer
10
10
  from helm.common.optional_dependencies import handle_module_not_found_error
11
11
 
12
12
  try:
@@ -1,5 +1,5 @@
1
- from .configuration import DalleBartConfig
2
- from .modeling import DalleBart
3
- from .partitions import set_partitions
4
- from .processor import DalleBartProcessor
5
- from .tokenizer import DalleBartTokenizer
1
+ from helm.clients.image_generation.dalle_mini.model.configuration import DalleBartConfig
2
+ from helm.clients.image_generation.dalle_mini.model.modeling import DalleBart
3
+ from helm.clients.image_generation.dalle_mini.model.partitions import set_partitions
4
+ from helm.clients.image_generation.dalle_mini.model.processor import DalleBartProcessor
5
+ from helm.clients.image_generation.dalle_mini.model.tokenizer import DalleBartTokenizer
@@ -18,7 +18,7 @@ import warnings
18
18
  from transformers.configuration_utils import PretrainedConfig
19
19
  from transformers.utils import logging
20
20
 
21
- from .utils import PretrainedFromWandbMixin
21
+ from helm.clients.image_generation.dalle_mini.model.utils import PretrainedFromWandbMixin
22
22
 
23
23
  logger = logging.get_logger(__name__)
24
24
 
@@ -35,8 +35,8 @@ from transformers.utils import ModelOutput, logging
35
35
  from transformers.generation.configuration_utils import GenerationConfig
36
36
 
37
37
  from helm.common.optional_dependencies import handle_module_not_found_error
38
- from .configuration import DalleBartConfig
39
- from .utils import PretrainedFromWandbMixin
38
+ from helm.clients.image_generation.dalle_mini.model.configuration import DalleBartConfig
39
+ from helm.clients.image_generation.dalle_mini.model.utils import PretrainedFromWandbMixin
40
40
 
41
41
  try:
42
42
  import flax
@@ -2,10 +2,10 @@
2
2
 
3
3
  from typing import List
4
4
 
5
- from .configuration import DalleBartConfig
6
- from .text import TextNormalizer
7
- from .tokenizer import DalleBartTokenizer
8
- from .utils import PretrainedFromWandbMixin
5
+ from helm.clients.image_generation.dalle_mini.model.configuration import DalleBartConfig
6
+ from helm.clients.image_generation.dalle_mini.model.text import TextNormalizer
7
+ from helm.clients.image_generation.dalle_mini.model.tokenizer import DalleBartTokenizer
8
+ from helm.clients.image_generation.dalle_mini.model.utils import PretrainedFromWandbMixin
9
9
  from helm.common.optional_dependencies import handle_module_not_found_error
10
10
 
11
11
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  from transformers import BartTokenizerFast
4
4
 
5
- from .utils import PretrainedFromWandbMixin
5
+ from helm.clients.image_generation.dalle_mini.model.utils import PretrainedFromWandbMixin
6
6
 
7
7
 
8
8
  class DalleBartTokenizer(PretrainedFromWandbMixin, BartTokenizerFast):
@@ -1 +1 @@
1
- from . import *
1
+ from helm.clients.image_generation.dalle_mini.vqgan_jax import *
@@ -2,8 +2,8 @@ import re
2
2
 
3
3
  import torch
4
4
 
5
- from .modeling_flax_vqgan import VQModel
6
- from .configuration_vqgan import VQGANConfig
5
+ from helm.clients.image_generation.dalle_mini.vqgan_jax.modeling_flax_vqgan import VQModel
6
+ from helm.clients.image_generation.dalle_mini.vqgan_jax.configuration_vqgan import VQGANConfig
7
7
  from helm.common.optional_dependencies import handle_module_not_found_error
8
8
 
9
9
  try:
@@ -6,7 +6,7 @@ import math
6
6
 
7
7
  from transformers.modeling_flax_utils import FlaxPreTrainedModel
8
8
 
9
- from .configuration_vqgan import VQGANConfig
9
+ from helm.clients.image_generation.dalle_mini.vqgan_jax.configuration_vqgan import VQGANConfig
10
10
  from helm.common.optional_dependencies import handle_module_not_found_error
11
11
 
12
12
  try:
@@ -15,7 +15,7 @@ from helm.common.tokenization_request import (
15
15
  TokenizationRequestResult,
16
16
  )
17
17
  from helm.clients.client import Client, CachingClient
18
- from .image_generation_client_utils import get_single_image_multimedia_object
18
+ from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
19
19
 
20
20
 
21
21
  class DALLEMiniClient(Client):
@@ -9,7 +9,7 @@ from helm.common.tokenization_request import (
9
9
  DecodeRequestResult,
10
10
  )
11
11
  from helm.clients.client import Client, CachingClient
12
- from .image_generation_client_utils import get_single_image_multimedia_object
12
+ from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
13
13
 
14
14
 
15
15
  class DeepFloydClient(Client):
@@ -17,7 +17,7 @@ from helm.common.tokenization_request import (
17
17
  TokenizationRequestResult,
18
18
  )
19
19
  from helm.clients.client import Client, CachingClient
20
- from .image_generation_client_utils import get_single_image_multimedia_object
20
+ from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
21
21
 
22
22
 
23
23
  _models_lock: Lock = Lock()
@@ -14,7 +14,7 @@ from helm.common.tokenization_request import (
14
14
  DecodeRequestResult,
15
15
  )
16
16
  from helm.clients.client import Client, CachingClient
17
- from .image_generation_client_utils import get_single_image_multimedia_object
17
+ from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
18
18
 
19
19
 
20
20
  class LexicaClient(Client):
@@ -11,12 +11,12 @@ from typing import Optional, Tuple
11
11
  from torch.cuda.amp import autocast
12
12
  from torch.optim.lr_scheduler import CosineAnnealingLR
13
13
  from torch.nn import functional as F
14
- from .stage1.vqgan import VQGAN
15
- from .stage2.transformer import Transformer1d, iGPT
16
- from .. import utils
17
- from ..utils.config import get_base_config
18
- from ..utils.sampling import sampling, sampling_igpt
19
- from .tokenizer import build_tokenizer
14
+ from helm.clients.image_generation.mindalle.models.stage1.vqgan import VQGAN
15
+ from helm.clients.image_generation.mindalle.models.stage2.transformer import Transformer1d, iGPT
16
+ from helm.clients.image_generation.mindalle import utils
17
+ from helm.clients.image_generation.mindalle.utils.config import get_base_config
18
+ from helm.clients.image_generation.mindalle.utils.sampling import sampling, sampling_igpt
19
+ from helm.clients.image_generation.mindalle.models.tokenizer import build_tokenizer
20
20
  from helm.common.optional_dependencies import handle_module_not_found_error
21
21
 
22
22
 
@@ -7,7 +7,7 @@ import torch
7
7
  import torch.nn as nn
8
8
  from typing import List, Tuple, Optional
9
9
 
10
- from .layers import Encoder, Decoder
10
+ from helm.clients.image_generation.mindalle.models.stage1.layers import Encoder, Decoder
11
11
  from helm.common.optional_dependencies import handle_module_not_found_error
12
12
 
13
13
 
@@ -11,7 +11,7 @@ import torch
11
11
  import torch.nn as nn
12
12
  from typing import Optional, Tuple, List
13
13
  from torch.cuda.amp import autocast
14
- from .layers import Block
14
+ from helm.clients.image_generation.mindalle.models.stage2.layers import Block
15
15
 
16
16
  from helm.common.optional_dependencies import handle_module_not_found_error
17
17
 
@@ -1,3 +1,3 @@
1
- from .utils import *
2
- from .config import *
3
- from .sampling import *
1
+ from helm.clients.image_generation.mindalle.utils.utils import *
2
+ from helm.clients.image_generation.mindalle.utils.config import *
3
+ from helm.clients.image_generation.mindalle.utils.sampling import *
@@ -15,7 +15,7 @@ from helm.common.tokenization_request import (
15
15
  TokenizationRequestResult,
16
16
  )
17
17
  from helm.clients.client import Client, CachingClient
18
- from .image_generation_client_utils import get_single_image_multimedia_object
18
+ from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
19
19
 
20
20
  try:
21
21
  from PIL import Image
@@ -13,7 +13,7 @@ from helm.common.tokenization_request import (
13
13
  )
14
14
 
15
15
  from helm.clients.client import CachingClient, Client
16
- from .image_generation_client_utils import get_single_image_multimedia_object
16
+ from helm.clients.image_generation.image_generation_client_utils import get_single_image_multimedia_object
17
17
 
18
18
 
19
19
  class TogetherImageGenerationClient(Client):
@@ -12,8 +12,8 @@ from helm.common.optional_dependencies import OptionalDependencyNotInstalled
12
12
  from helm.common.request import Request, RequestResult, GeneratedOutput, Token
13
13
  from helm.tokenizers.tokenizer import Tokenizer
14
14
 
15
- from .client import CachingClient
16
- from .lit_gpt_generate import generate # type: ignore
15
+ from helm.clients.client import CachingClient
16
+ from helm.clients.lit_gpt_generate import generate # type: ignore
17
17
 
18
18
  try:
19
19
  import lightning as L