crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -5,14 +5,14 @@ from typing import Any, Dict, List
5
5
 
6
6
  from helm.clients.openai_client import OpenAIClient
7
7
  from helm.common.cache import CacheConfig
8
- from helm.common.hierarchical_logger import hlog
8
+ from helm.common.hierarchical_logger import hwarn
9
9
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
10
10
  from helm.common.tokenization_request import (
11
11
  TokenizationRequest,
12
12
  TokenizationRequestResult,
13
13
  )
14
14
  from helm.tokenizers.tokenizer import Tokenizer
15
- from .client import CachingClient, truncate_sequence
15
+ from helm.clients.client import CachingClient, truncate_sequence
16
16
 
17
17
 
18
18
  _CONTENT_MODERATION_KEY = "fail.content.moderation.failed"
@@ -103,10 +103,7 @@ class PalmyraClient(CachingClient):
103
103
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
104
104
 
105
105
  if _is_content_moderation_failure(response):
106
- hlog(
107
- f"WARNING: Returning empty request for {request.model_deployment} "
108
- "due to content moderation filter"
109
- )
106
+ hwarn(f"Returning empty request for {request.model_deployment} " "due to content moderation filter")
110
107
  return RequestResult(
111
108
  success=False,
112
109
  cached=False,
@@ -163,6 +160,3 @@ class PalmyraChatClient(OpenAIClient):
163
160
  org_id=None,
164
161
  base_url="https://api.writer.com/v1/chat",
165
162
  )
166
-
167
- def _is_chat_model_engine(self, model_engine: str) -> bool:
168
- return True
@@ -6,10 +6,10 @@ from helm.proxy.retry import NonRetriableException
6
6
  from helm.common.cache import CacheConfig
7
7
  from helm.common.media_object import TEXT_TYPE
8
8
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
9
- from helm.common.hierarchical_logger import hlog
9
+ from helm.common.hierarchical_logger import hwarn
10
10
  from helm.common.optional_dependencies import handle_module_not_found_error
11
11
  from helm.tokenizers.tokenizer import Tokenizer
12
- from .client import CachingClient, truncate_and_tokenize_response_text
12
+ from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
13
13
 
14
14
  try:
15
15
  import reka
@@ -121,7 +121,7 @@ class RekaClient(CachingClient):
121
121
  if messages[-1]["role"] != "user":
122
122
  raise ValueError("Last message must have role 'user'")
123
123
  if request.prompt != "":
124
- hlog("WARNING: Since message is set, prompt will be ignored")
124
+ hwarn("Since message is set, prompt will be ignored")
125
125
  reka_chat_history = self._convert_messages_to_reka_chat_history(messages)
126
126
  else:
127
127
  current_chat_history: Dict[str, Any] = {
@@ -0,0 +1,58 @@
1
+ from typing import Optional
2
+
3
+ from helm.clients.azure_openai_client import AzureOpenAIClient
4
+ from helm.common.cache import CacheConfig
5
+ from helm.proxy.retry import NonRetriableException
6
+ from helm.tokenizers.tokenizer import Tokenizer
7
+
8
+
9
class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
    """
    Azure-OpenAI-style client for OpenAI models hosted on Stanford Health Care's model API.

    The real credential travels in the `Ocp-Apim-Subscription-Key` request header;
    the parent client is handed the placeholder API key "unused".

    Configure by setting the following in prod_env/credentials.conf:

    ```
    stanfordhealthcareEndpoint: https://your-domain-name/
    stanfordhealthcareApiKey: your-private-key
    ```
    """

    CREDENTIAL_HEADER_NAME = "Ocp-Apim-Subscription-Key"

    def __init__(
        self,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        cache_config: CacheConfig,
        openai_model_name: str,
        api_version: str,
        api_key: Optional[str] = None,
        endpoint: Optional[str] = None,
        base_url: Optional[str] = None,
    ):
        """
        Args:
            tokenizer: Forwarded unchanged to the parent client.
            tokenizer_name: Forwarded unchanged to the parent client.
            cache_config: Forwarded unchanged to the parent client.
            openai_model_name: Forwarded as `azure_openai_deployment_name`.
            api_version: Forwarded unchanged to the parent client.
            api_key: Subscription key sent in the credential header. Required.
            endpoint: Forwarded as `endpoint` when no `base_url` is given; otherwise
                substituted for `{endpoint}` in `base_url`.
            base_url: Optional URL template formatted with `endpoint`.

        Raises:
            NonRetriableException: If `api_key` is missing or empty.
        """
        if not api_key:
            raise NonRetriableException("Must provide API key through credentials.conf")

        # Exactly one of `base_url` / `endpoint` is forwarded to the parent,
        # depending on whether a base URL template was supplied.
        if base_url:
            location_kwargs = {"base_url": base_url.format(endpoint=endpoint)}
        else:
            location_kwargs = {"endpoint": endpoint}

        credential_headers = {StanfordHealthCareAzureOpenAIClient.CREDENTIAL_HEADER_NAME: api_key}
        super().__init__(
            tokenizer=tokenizer,
            tokenizer_name=tokenizer_name,
            cache_config=cache_config,
            api_key="unused",
            azure_openai_deployment_name=openai_model_name,
            api_version=api_version,
            default_headers=credential_headers,
            **location_kwargs,
        )
@@ -0,0 +1,31 @@
1
+ from typing import Any, Dict, List
2
+
3
+ from helm.common.request import (
4
+ Request,
5
+ GeneratedOutput,
6
+ )
7
+ from helm.clients.stanfordhealthcare_http_model_client import StanfordHealthCareHTTPModelClient
8
+
9
+
10
class StanfordHealthCareClaudeClient(StanfordHealthCareHTTPModelClient):
    """
    Client for accessing Claude models hosted on Stanford Health Care's model API.

    Configure by setting the following in prod_env/credentials.conf:

    ```
    stanfordhealthcareEndpoint: https://your-domain-name/
    stanfordhealthcareApiKey: your-private-key
    ```
    """

    def get_request(self, request: Request) -> Dict[str, Any]:
        """Build the JSON payload from the request's prompt, token limit and temperature."""
        payload: Dict[str, Any] = dict(
            model_id=self.model,
            prompt_text=request.prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
        )
        return payload

    def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
        """Turn every entry of `response["content"]` into one output (logprob 0, no tokens)."""
        outputs: List[GeneratedOutput] = []
        for content_item in response["content"]:
            outputs.append(GeneratedOutput(text=content_item["text"], logprob=0, tokens=[]))
        return outputs
@@ -0,0 +1,43 @@
1
+ from typing import Any, Dict, List
2
+
3
+ from helm.common.request import (
4
+ Request,
5
+ GeneratedOutput,
6
+ )
7
+ from helm.clients.stanfordhealthcare_http_model_client import StanfordHealthCareHTTPModelClient
8
+
9
+
10
class StanfordHealthCareGoogleClient(StanfordHealthCareHTTPModelClient):
    """
    Client for accessing Google models hosted on Stanford Health Care's model API.

    Configure by setting the following in prod_env/credentials.conf:

    ```
    stanfordhealthcareEndpoint: https://your-domain-name/
    stanfordhealthcareApiKey: your-private-key
    ```
    """

    def get_request(self, request: Request) -> Dict[str, Any]:
        """Build the payload: a single user message plus the generation settings."""
        user_message = {
            "role": "user",
            "parts": {"text": request.prompt},
        }
        generation_settings = {
            "temperature": request.temperature,
            "topP": request.top_p,
            "topK": request.top_k_per_token,
            "candidateCount": 1,
            "maxOutputTokens": request.max_tokens,
            "stopSequences": request.stop_sequences,
        }
        return {"contents": user_message, "generation_config": generation_settings}

    def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
        """
        Concatenate the text of the first candidate of every item in `response["content"]`.

        Items whose first candidate has no "content" key contribute nothing.
        Always returns exactly one output (logprob 0, no tokens).
        """
        pieces: List[str] = []
        for chunk in response["content"]:
            first_candidate = chunk["candidates"][0]
            if "content" not in first_candidate:
                continue
            pieces.append(first_candidate["content"]["parts"][0]["text"])
        return [GeneratedOutput(text="".join(pieces), logprob=0, tokens=[])]
@@ -0,0 +1,93 @@
1
+ import requests
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import asdict
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from helm.common.cache import CacheConfig
8
+ from helm.common.request import (
9
+ wrap_request_time,
10
+ Request,
11
+ RequestResult,
12
+ GeneratedOutput,
13
+ EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
14
+ )
15
+ from helm.clients.client import CachingClient
16
+
17
+
18
class StanfordHealthCareHTTPModelClient(CachingClient, ABC):
    """
    Client for accessing Stanford Health Care models via HTTP requests.

    Subclasses supply the request payload (`get_request`) and turn the decoded
    JSON response into completions (`parse_response`).

    Configure by setting the following in prod_env/credentials.conf:

    ```
    stanfordhealthcareEndpoint: https://your-domain-name/
    stanfordhealthcareApiKey: your-private-key
    ```
    """

    CREDENTIAL_HEADER_NAME = "Ocp-Apim-Subscription-Key"

    def __init__(
        self,
        cache_config: CacheConfig,
        deployment: str,
        endpoint: str = "http://localhost:8080",
        do_cache: bool = False,
        timeout: int = 3000,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
    ):
        """
        Args:
            cache_config: Forwarded to `CachingClient`.
            deployment: Path segment appended to `endpoint` to form the request URL.
            endpoint: Base URL of the service.
            do_cache: When true, responses are stored in and served from the cache.
            timeout: Passed as `timeout` to `requests.post`.
            api_key: Subscription key sent in the credential header. Required.
            model: Model identifier made available to subclasses as `self.model`.

        Raises:
            NonRetriableException: If `api_key` is missing or empty.
        """
        super().__init__(cache_config=cache_config)
        if not api_key:
            # An explicit raise instead of `assert`: assertions are stripped under
            # `python -O`, which would let a missing key through to request time.
            from helm.proxy.retry import NonRetriableException

            raise NonRetriableException("API key must be provided")
        self.endpoint = endpoint
        self.timeout = timeout
        self.do_cache = do_cache
        self.deployment = deployment
        self.model = model
        self.default_headers = {StanfordHealthCareHTTPModelClient.CREDENTIAL_HEADER_NAME: api_key}

    def make_request(self, request: Request) -> RequestResult:
        """
        POST the request to `<endpoint>/<deployment>` and wrap the outcome.

        Embedding requests are rejected up front. Failures raised by `requests`
        are reported as an unsuccessful `RequestResult` rather than propagated.
        """
        if request.embedding:
            return EMBEDDING_UNAVAILABLE_REQUEST_RESULT

        # Only needed once we know the request will actually be served.
        cache_key = asdict(request)
        raw_request = self.get_request(request)

        try:

            def do_it() -> Dict[str, Any]:
                # NOTE(review): an endpoint with a trailing slash yields "//" here — confirm the server accepts it.
                url = f"{self.endpoint}/{self.deployment}"
                response = requests.post(url, json=raw_request, headers=self.default_headers, timeout=self.timeout)
                response.raise_for_status()
                response_json = response.json()
                # A bare JSON array is wrapped so the result is always a dict
                # (wrap_request_time adds "request_time" to it).
                if isinstance(response_json, list):
                    response_json = {"content": response_json}
                return response_json

            if self.do_cache:
                response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
            else:
                response, cached = wrap_request_time(do_it)(), False

            completions = self.parse_response(response)

            return RequestResult(
                success=True,
                cached=cached,
                error=None,
                completions=completions,
                embedding=[],
                request_time=response["request_time"],
            )
        except requests.exceptions.RequestException as e:
            return RequestResult(success=False, cached=False, error=f"Request error: {e}", completions=[], embedding=[])

    @abstractmethod
    def get_request(self, request: Request) -> Dict[str, Any]:
        """Return the JSON-serializable payload to POST for `request`."""
        pass

    @abstractmethod
    def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
        """Convert the decoded JSON response into completions."""
        pass
@@ -0,0 +1,62 @@
1
+ from typing import Optional
2
+
3
+ from helm.clients.openai_client import OpenAIClient
4
+ from helm.common.cache import CacheConfig
5
+ from helm.common.optional_dependencies import handle_module_not_found_error
6
+ from helm.proxy.retry import NonRetriableException
7
+ from helm.tokenizers.tokenizer import Tokenizer
8
+
9
+ try:
10
+ from openai import OpenAI
11
+ except ModuleNotFoundError as e:
12
+ handle_module_not_found_error(e, ["openai"])
13
+
14
+
15
class StanfordHealthCareOpenAIClient(OpenAIClient):
    """
    Client for accessing OpenAI models hosted on Stanford Health Care's model API.

    The real credential travels in the `Ocp-Apim-Subscription-Key` request header;
    the OpenAI SDK client itself is given a placeholder API key.

    Configure by setting the following in prod_env/credentials.conf:

    ```
    stanfordhealthcareEndpoint: https://your-domain-name/
    stanfordhealthcareApiKey: your-private-key
    ```
    """

    CREDENTIAL_HEADER_NAME = "Ocp-Apim-Subscription-Key"

    def __init__(
        self,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        cache_config: CacheConfig,
        model_name: str,
        api_key: Optional[str] = None,
        endpoint: Optional[str] = None,
        base_url: Optional[str] = None,
        openai_model_name: Optional[str] = None,
        output_processor: Optional[str] = None,
    ):
        """
        Args:
            tokenizer: Forwarded unchanged to the parent client.
            tokenizer_name: Forwarded unchanged to the parent client.
            cache_config: Forwarded unchanged to the parent client.
            model_name: Accepted but not used by this constructor.
            api_key: Subscription key sent in the credential header. Required.
            endpoint: Value substituted for `{endpoint}` in `base_url`. Required.
            base_url: URL template formatted with `endpoint`. Required.
            openai_model_name: Forwarded unchanged to the parent client.
            output_processor: Forwarded unchanged to the parent client.

        Raises:
            NonRetriableException: If `endpoint`, `api_key` or `base_url` is missing or empty.
        """
        # Validate the configuration before building any client so a
        # misconfigured deployment fails without constructing the parent first.
        if not endpoint:
            raise NonRetriableException("Must provide endpoint through credentials.conf")
        if not api_key:
            raise NonRetriableException("Must provide API key through credentials.conf")
        if not base_url:
            raise NonRetriableException("Must provide base url through model_deployments")

        super().__init__(
            tokenizer=tokenizer,
            tokenizer_name=tokenizer_name,
            cache_config=cache_config,
            api_key="unused",
            openai_model_name=openai_model_name,
            output_processor=output_processor,
        )

        # The base URL is a template from the deployment configuration; fill in the endpoint.
        base_url = base_url.format(endpoint=endpoint)
        # Replace the client built by the parent with one that targets the
        # hosted endpoint and carries the subscription key header.
        self.client = OpenAI(
            api_key="dummy",
            base_url=base_url,
            default_headers={StanfordHealthCareOpenAIClient.CREDENTIAL_HEADER_NAME: api_key},
        )
@@ -0,0 +1,42 @@
1
+ from typing import Dict, Optional
2
+
3
+ from helm.clients.azure_openai_client import AzureOpenAIClient
4
+ from helm.common.cache import CacheConfig
5
+ from helm.proxy.retry import NonRetriableException
6
+ from helm.tokenizers.tokenizer import Tokenizer
7
+
8
+
9
class StanfordHealthCareSHCOpenAIClient(AzureOpenAIClient):
    """
    Client for accessing OpenAI models hosted on Stanford Health Care's model API.

    Uses a fixed Azure API version (`API_VERSION`) and passes the API key
    directly to the parent client.

    Configure by setting the following in prod_env/credentials.conf:

    ```
    stanfordhealthcareEndpoint: https://your-domain-name/
    stanfordhealthcareApiKey: your-private-key
    ```
    """

    API_VERSION = "2024-08-01-preview"

    def __init__(
        self,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        cache_config: CacheConfig,
        api_key: Optional[str] = None,
        endpoint: Optional[str] = None,
        default_headers: Optional[Dict[str, str]] = None,
    ):
        """
        Raises:
            NonRetriableException: If `api_key` is missing or empty.
        """
        if not api_key:
            raise NonRetriableException("Must provide API key through credentials.conf")

        # Everything is forwarded as-is; only the API version is pinned here.
        parent_kwargs = {
            "tokenizer": tokenizer,
            "tokenizer_name": tokenizer_name,
            "cache_config": cache_config,
            "api_key": api_key,
            "endpoint": endpoint,
            "api_version": StanfordHealthCareSHCOpenAIClient.API_VERSION,
            "default_headers": default_headers,
        }
        super().__init__(**parent_kwargs)
@@ -1,6 +1,6 @@
1
1
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
2
2
  from helm.tokenizers.auto_tokenizer import AutoTokenizer
3
- from .client import truncate_sequence, truncate_and_tokenize_response_text
3
+ from helm.clients.client import truncate_sequence, truncate_and_tokenize_response_text
4
4
  from typing import List
5
5
  from helm.common.request import Request, GeneratedOutput, Token
6
6
 
@@ -5,7 +5,12 @@ import tempfile
5
5
  from helm.common.cache import BlackHoleCacheConfig, SqliteCacheConfig
6
6
  from helm.common.request import Request
7
7
 
8
- from .together_client import TogetherClient, TogetherChatClient, TogetherCompletionClient, TogetherClientError
8
+ from helm.clients.together_client import (
9
+ TogetherClient,
10
+ TogetherChatClient,
11
+ TogetherCompletionClient,
12
+ TogetherClientError,
13
+ )
9
14
 
10
15
 
11
16
  class TestTogetherClient:
@@ -1,15 +1,18 @@
1
1
  from copy import deepcopy
2
2
  from itertools import zip_longest
3
+ import re
3
4
  import threading
4
- from typing import List, Dict, Any, Mapping, Optional, TypedDict, Union
5
+ from typing import Callable, List, Dict, Any, Mapping, Optional, Tuple, TypedDict, Union
6
+ from typing_extensions import NotRequired
5
7
 
6
8
  import requests
7
9
  from retrying import retry
8
10
 
9
11
  from helm.common.cache import CacheConfig
10
12
  from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
13
+ from helm.common.object_spec import get_class_by_name
11
14
  from helm.common.optional_dependencies import handle_module_not_found_error
12
- from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
15
+ from helm.common.request import Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
13
16
  from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
14
17
 
15
18
  try:
@@ -98,6 +101,19 @@ class JobNotFinishedError(TogetherClientError):
98
101
  pass
99
102
 
100
103
 
104
+ def _parse_thinking(input: str) -> Tuple[str, str]:
105
+ """Return a tuple of thinking text and output text."""
106
+ match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
107
+ if match:
108
+ return (match.group(1), match.group(2))
109
+
110
+ match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
111
+ if match:
112
+ return (match.group(1), "")
113
+
114
+ return (input, "")
115
+
116
+
101
117
  class TogetherClient(CachingClient):
102
118
  """
103
119
  Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -313,15 +329,31 @@ class TogetherRawChatRequest(TypedDict):
313
329
  logprobs: int
314
330
  echo: bool
315
331
  n: int
332
+ response_format: NotRequired[Dict[str, Any]]
316
333
 
317
334
 
318
335
  class TogetherChatClient(CachingClient):
319
336
  """Client that uses the Python Together library for chat models."""
320
337
 
321
- def __init__(self, cache_config: CacheConfig, api_key: Optional[str], together_model: Optional[str] = None):
338
def __init__(
    self,
    cache_config: CacheConfig,
    api_key: Optional[str],
    together_model: Optional[str] = None,
    disable_logprobs: Optional[bool] = None,
    output_processor: Optional[str] = None,
    parse_thinking: Optional[bool] = None,
):
    """
    Args:
        cache_config: Forwarded to `CachingClient`.
        api_key: Passed to the `Together` SDK client.
        together_model: Stored as the client's Together model override (may be None).
        disable_logprobs: Optional flag; None is treated as False.
        output_processor: Optional name resolved with `get_class_by_name`.
        parse_thinking: Optional flag; None is treated as False.
    """
    super().__init__(cache_config=cache_config)
    self._client = Together(api_key=api_key)
    self._together_model = together_model

    # Both optional flags collapse None to False.
    self._disable_logprobs = True if disable_logprobs else False
    self._parse_thinking = True if parse_thinking else False

    # Despite the helper's name, the resolved object is a str -> str function, not a class.
    resolved_processor: Optional[Callable[[str], str]] = None
    if output_processor:
        resolved_processor = get_class_by_name(output_processor)
    self.output_processor: Optional[Callable[[str], str]] = resolved_processor
325
357
 
326
358
  def convert_to_raw_chat_request(self, request: Request) -> TogetherRawChatRequest:
327
359
  request.validate()
@@ -353,7 +385,11 @@ class TogetherChatClient(CachingClient):
353
385
  model = self._together_model
354
386
  else:
355
387
  model = request.model
356
- return {
388
+ if self._disable_logprobs:
389
+ logprobs = 0
390
+ else:
391
+ logprobs = min(request.top_k_per_token, 1)
392
+ raw_chat_request: TogetherRawChatRequest = {
357
393
  "messages": messages,
358
394
  "model": model,
359
395
  "max_tokens": request.max_tokens,
@@ -361,10 +397,16 @@ class TogetherChatClient(CachingClient):
361
397
  "temperature": request.temperature,
362
398
  "top_p": request.top_p,
363
399
  "top_k": request.top_k_per_token,
364
- "logprobs": min(request.top_k_per_token, 1),
400
+ "logprobs": logprobs,
365
401
  "echo": request.echo_prompt,
366
402
  "n": request.num_completions,
367
403
  }
404
+ if request.response_format and request.response_format.json_schema:
405
+ raw_chat_request["response_format"] = {
406
+ "type": "json_object",
407
+ "schema": request.response_format.json_schema,
408
+ }
409
+ return raw_chat_request
368
410
 
369
411
  def make_request(self, request: Request) -> RequestResult:
370
412
  raw_request = self.convert_to_raw_chat_request(request)
@@ -398,8 +440,21 @@ class TogetherChatClient(CachingClient):
398
440
  if token_text is None:
399
441
  break
400
442
  tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
443
+ logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
401
444
  assert choice.message.role == "assistant"
402
- generated_outputs.append(GeneratedOutput(text=choice.message.content, logprob=0.0, tokens=tokens))
445
+ output_text = choice.message.content
446
+ if self.output_processor:
447
+ output_text = self.output_processor(output_text)
448
+
449
+ if self._parse_thinking:
450
+ thinking_text, output_text = _parse_thinking(output_text)
451
+ generated_outputs.append(
452
+ GeneratedOutput(
453
+ text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
454
+ )
455
+ )
456
+ else:
457
+ generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
403
458
  return RequestResult(
404
459
  success=True,
405
460
  cached=cached,
@@ -426,16 +481,27 @@ class TogetherRawCompletionRequest(TypedDict):
426
481
  class TogetherCompletionClient(CachingClient):
427
482
  """Client that uses the Python Together library for text completion models."""
428
483
 
429
- def __init__(self, cache_config: CacheConfig, api_key: Optional[str], together_model: Optional[str] = None):
484
def __init__(
    self,
    cache_config: CacheConfig,
    api_key: Optional[str],
    together_model: Optional[str] = None,
    disable_logprobs: Optional[bool] = None,
):
    """
    Args:
        cache_config: Forwarded to `CachingClient`.
        api_key: Passed to the `Together` SDK client.
        together_model: Stored as the client's Together model override (may be None).
        disable_logprobs: Optional flag; None is treated as False.
    """
    super().__init__(cache_config=cache_config)
    self._client = Together(api_key=api_key)
    self._together_model = together_model
    # Collapse the optional flag to a plain bool.
    self._disable_logprobs = True if disable_logprobs else False
433
495
 
434
496
  def convert_to_raw_completion_request(self, request: Request) -> TogetherRawCompletionRequest:
435
497
  if self._together_model is not None:
436
498
  model = self._together_model
437
499
  else:
438
500
  model = request.model
501
+ if self._disable_logprobs:
502
+ logprobs = 0
503
+ else:
504
+ logprobs = min(request.top_k_per_token, 1)
439
505
  return {
440
506
  "prompt": request.prompt,
441
507
  "model": model,
@@ -444,7 +510,7 @@ class TogetherCompletionClient(CachingClient):
444
510
  "temperature": request.temperature,
445
511
  "top_p": request.top_p,
446
512
  "top_k": request.top_k_per_token,
447
- "logprobs": min(request.top_k_per_token, 1),
513
+ "logprobs": logprobs,
448
514
  "echo": request.echo_prompt,
449
515
  "n": request.num_completions,
450
516
  }
@@ -481,8 +547,9 @@ class TogetherCompletionClient(CachingClient):
481
547
  if token_text is None:
482
548
  break
483
549
  tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
550
+ logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
484
551
  assert choice.text
485
- generated_outputs.append(GeneratedOutput(text=choice.text, logprob=0.0, tokens=tokens))
552
+ generated_outputs.append(GeneratedOutput(text=choice.text, logprob=logprob, tokens=tokens))
486
553
  return RequestResult(
487
554
  success=True,
488
555
  cached=cached,
@@ -0,0 +1,23 @@
1
+ from helm.clients.openai_client import OpenAIClient
2
+ from helm.common.cache import CacheConfig
3
+ from helm.tokenizers.tokenizer import Tokenizer
4
+
5
+
6
class UpstageChatClient(OpenAIClient):
    """Sends requests to an Upstage model using an OpenAI-compatible Chat API."""

    def __init__(
        self,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        cache_config: CacheConfig,
        api_key: str,
    ):
        """Configure the parent OpenAI client with the Upstage base URL and no organization ID."""
        parent_kwargs = {
            "tokenizer": tokenizer,
            "tokenizer_name": tokenizer_name,
            "cache_config": cache_config,
            "api_key": api_key,
            "org_id": None,
            "base_url": "https://api.upstage.ai/v1/solar",
        }
        super().__init__(**parent_kwargs)