crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
1
- window.RELEASE = "v1.0.0";
1
+ window.RELEASE = "v1.9.0";
2
2
  window.BENCHMARK_OUTPUT_BASE_URL =
3
3
  "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/";
4
4
  window.PROJECT_ID = "lite";
@@ -2,16 +2,16 @@
2
2
  <html lang="en">
3
3
  <head>
4
4
  <meta charset="UTF-8" />
5
- <link rel="icon" type="image/svg+xml" href="./helm.svg" />
5
+ <link rel="icon" type="image/svg+xml" href="https://crfm.stanford.edu/helm/helm.svg" />
6
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
7
  <title>Holistic Evaluation of Language Models (HELM)</title>
8
8
  <meta name="description" content="The Holistic Evaluation of Language Models (HELM) serves as a living benchmark for transparency in language models. Providing broad coverage and recognizing incompleteness, multi-metric measurements, and standardization. All data and analysis are freely accessible on the website for exploration and study." />
9
9
  <script type="text/javascript" src="./config.js"></script>
10
- <script type="module" crossorigin src="./assets/index-3ee38b3d.js"></script>
11
- <link rel="modulepreload" crossorigin href="./assets/react-d4a0b69b.js">
12
- <link rel="modulepreload" crossorigin href="./assets/recharts-6d337683.js">
13
- <link rel="modulepreload" crossorigin href="./assets/tremor-54a99cc4.js">
14
- <link rel="stylesheet" href="./assets/index-05c76bb1.css">
10
+ <script type="module" crossorigin src="./assets/index-94295e78.js"></script>
11
+ <link rel="modulepreload" crossorigin href="./assets/react-f82877fd.js">
12
+ <link rel="modulepreload" crossorigin href="./assets/recharts-4037aff0.js">
13
+ <link rel="modulepreload" crossorigin href="./assets/tremor-38a10867.js">
14
+ <link rel="stylesheet" href="./assets/index-b9779128.css">
15
15
  </head>
16
16
  <body class="block">
17
17
  <div id="root"></div>
@@ -1,4 +1,4 @@
1
- from .local_window_service import LocalWindowService
1
+ from helm.benchmark.window_services.local_window_service import LocalWindowService
2
2
 
3
3
 
4
4
  class DefaultWindowService(LocalWindowService):
@@ -1,7 +1,7 @@
1
1
  from abc import ABC
2
2
 
3
- from helm.common.hierarchical_logger import hlog
4
- from .local_window_service import LocalWindowService
3
+ from helm.common.hierarchical_logger import hwarn
4
+ from helm.benchmark.window_services.local_window_service import LocalWindowService
5
5
 
6
6
 
7
7
  class EncoderDecoderWindowService(LocalWindowService, ABC):
@@ -21,8 +21,8 @@ class EncoderDecoderWindowService(LocalWindowService, ABC):
21
21
  vs. the completions, we check the two values separately.
22
22
  """
23
23
  if expected_completion_token_length > self.max_output_length:
24
- hlog(
25
- f"WARNING: The expected completion token length ({expected_completion_token_length}) "
24
+ hwarn(
25
+ f"The expected completion token length ({expected_completion_token_length}) "
26
26
  f"exceeds the max output length ({self.max_output_length})."
27
27
  )
28
28
  return self.get_num_tokens(text) <= self.max_request_length
@@ -1,4 +1,4 @@
1
- from .local_window_service import LocalWindowService
1
+ from helm.benchmark.window_services.local_window_service import LocalWindowService
2
2
 
3
3
 
4
4
  class ICEWindowService(LocalWindowService):
@@ -1,4 +1,4 @@
1
- from .clip_window_service import CLIPWindowService
1
+ from helm.benchmark.window_services.image_generation.clip_window_service import CLIPWindowService
2
2
 
3
3
 
4
4
  class LexicaSearchWindowService(CLIPWindowService):
@@ -1,4 +1,4 @@
1
- from .clip_window_service import CLIPWindowService
1
+ from helm.benchmark.window_services.image_generation.clip_window_service import CLIPWindowService
2
2
 
3
3
 
4
4
  class OpenAIDALLEWindowService(CLIPWindowService):
@@ -1,8 +1,8 @@
1
1
  from abc import ABC
2
2
  from typing import List, Optional, cast
3
3
 
4
- from .window_service import ConfigurableWindowService, EncodeResult
5
- from .tokenizer_service import TokenizerService
4
+ from helm.benchmark.window_services.window_service import ConfigurableWindowService, EncodeResult
5
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
6
6
  from helm.common.tokenization_request import (
7
7
  DecodeRequest,
8
8
  DecodeRequestResult,
@@ -3,9 +3,9 @@ import tempfile
3
3
  from typing import List
4
4
 
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
8
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
9
9
 
10
10
 
11
11
  class TestAnthropicWindowService:
@@ -3,9 +3,9 @@ import tempfile
3
3
  from typing import List
4
4
 
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
8
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
9
9
 
10
10
 
11
11
  class TestBloomWindowService:
@@ -3,8 +3,13 @@ import tempfile
3
3
 
4
4
  from helm.benchmark.window_services.tokenizer_service import TokenizerService
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .test_utils import get_tokenizer_service, TEST_PROMPT, GPT2_TEST_TOKENS, GPT2_TEST_TOKEN_IDS
7
- from .window_service_factory import WindowServiceFactory
6
+ from helm.benchmark.window_services.test_utils import (
7
+ get_tokenizer_service,
8
+ TEST_PROMPT,
9
+ GPT2_TEST_TOKENS,
10
+ GPT2_TEST_TOKEN_IDS,
11
+ )
12
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
13
 
9
14
 
10
15
  class TestGPT2WindowService:
@@ -2,9 +2,14 @@ import shutil
2
2
  import tempfile
3
3
 
4
4
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
- from .test_utils import get_tokenizer_service, TEST_PROMPT, GPT4_TEST_TOKEN_IDS, GPT4_TEST_TOKENS
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
5
+ from helm.benchmark.window_services.test_utils import (
6
+ get_tokenizer_service,
7
+ TEST_PROMPT,
8
+ GPT4_TEST_TOKEN_IDS,
9
+ GPT4_TEST_TOKENS,
10
+ )
11
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
12
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
13
 
9
14
 
10
15
  class TestOpenAIWindowService:
@@ -2,9 +2,14 @@ import shutil
2
2
  import tempfile
3
3
 
4
4
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
- from .tokenizer_service import TokenizerService
6
- from .window_service_factory import WindowServiceFactory
7
- from .test_utils import get_tokenizer_service, GPT2_TEST_TOKENS, GPT2_TEST_TOKEN_IDS, TEST_PROMPT
5
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
6
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
7
+ from helm.benchmark.window_services.test_utils import (
8
+ get_tokenizer_service,
9
+ GPT2_TEST_TOKENS,
10
+ GPT2_TEST_TOKEN_IDS,
11
+ TEST_PROMPT,
12
+ )
8
13
 
9
14
 
10
15
  class TestGPTJWindowService:
@@ -3,9 +3,9 @@ import tempfile
3
3
  from typing import List
4
4
 
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
8
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
9
9
 
10
10
 
11
11
  class TestGPTNeoXWindowService:
@@ -2,9 +2,14 @@ import shutil
2
2
  import tempfile
3
3
 
4
4
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
- from .test_utils import get_tokenizer_service, TEST_PROMPT, GPT2_TEST_TOKENS, GPT2_TEST_TOKEN_IDS
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
5
+ from helm.benchmark.window_services.test_utils import (
6
+ get_tokenizer_service,
7
+ TEST_PROMPT,
8
+ GPT2_TEST_TOKENS,
9
+ GPT2_TEST_TOKEN_IDS,
10
+ )
11
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
12
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
13
 
9
14
 
10
15
  class TestOpenAIWindowService:
@@ -2,9 +2,9 @@ import shutil
2
2
  import tempfile
3
3
 
4
4
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
5
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
8
 
9
9
 
10
10
  class TestOPTWindowService:
@@ -2,9 +2,9 @@ from tempfile import TemporaryDirectory
2
2
  from typing import List
3
3
 
4
4
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
- from .tokenizer_service import TokenizerService
6
- from .window_service_factory import WindowServiceFactory
7
- from .test_utils import get_tokenizer_service, TEST_PROMPT
5
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
6
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
7
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
8
8
 
9
9
 
10
10
  class TestPalmyraWindowService:
@@ -3,9 +3,9 @@ import tempfile
3
3
  from typing import List
4
4
 
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
8
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
9
9
 
10
10
 
11
11
  class TestT0ppWindowService:
@@ -3,9 +3,9 @@ import tempfile
3
3
  from typing import List
4
4
 
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
8
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
9
9
 
10
10
 
11
11
  class TestT511bWindowService:
@@ -3,9 +3,9 @@ import tempfile
3
3
  from typing import List
4
4
 
5
5
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
8
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
9
9
 
10
10
 
11
11
  class TestUL2WindowService:
@@ -1,10 +1,9 @@
1
1
  from typing import List
2
2
 
3
- from helm.common.authentication import Authentication
3
+ from helm.common.local_context import LocalContext
4
4
  from helm.common.cache_backend_config import CacheBackendConfig
5
- from helm.proxy.services.server_service import ServerService
6
5
  from helm.benchmark.metrics.metric_service import MetricService
7
- from .tokenizer_service import TokenizerService
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
8
7
 
9
8
 
10
9
  TEST_PROMPT: str = (
@@ -229,5 +228,5 @@ GPT4_TEST_TOKENS: List[str] = [
229
228
 
230
229
 
231
230
  def get_tokenizer_service(local_path: str, cache_backend_config: CacheBackendConfig) -> TokenizerService:
232
- service = ServerService(base_path=local_path, root_mode=True, cache_backend_config=cache_backend_config)
233
- return MetricService(service, Authentication("test"))
231
+ context = LocalContext(base_path=local_path, cache_backend_config=cache_backend_config)
232
+ return MetricService(context)
@@ -2,9 +2,9 @@ import shutil
2
2
  import tempfile
3
3
 
4
4
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
- from .test_utils import get_tokenizer_service, TEST_PROMPT
6
- from .tokenizer_service import TokenizerService
7
- from .window_service_factory import WindowServiceFactory
5
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
6
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
8
 
9
9
 
10
10
  class TestYaLMWindowService:
@@ -1,26 +1,25 @@
1
- from helm.common.authentication import Authentication
1
+ from helm.common.context import Context
2
2
  from helm.common.tokenization_request import (
3
3
  TokenizationRequest,
4
4
  TokenizationRequestResult,
5
5
  DecodeRequest,
6
6
  DecodeRequestResult,
7
7
  )
8
- from helm.proxy.services.service import Service
9
8
 
10
9
 
10
+ # TODO: Rename this to TokenizerContext
11
11
  class TokenizerService:
12
12
  """
13
- A wrapper around `Service` that makes only necessary server requests to tokenize.
13
+ A wrapper around `Context` that makes only necessary server requests to tokenize.
14
14
  """
15
15
 
16
- def __init__(self, service: Service, auth: Authentication):
17
- self._service: Service = service
18
- self._auth: Authentication = auth
16
+ def __init__(self, context: Context):
17
+ self._context: Context = context
19
18
 
20
19
  def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
21
20
  """Tokenize via an API."""
22
- return self._service.tokenize(self._auth, request)
21
+ return self._context.tokenize(request)
23
22
 
24
23
  def decode(self, request: DecodeRequest) -> DecodeRequestResult:
25
24
  """Decode via an API."""
26
- return self._service.decode(self._auth, request)
25
+ return self._context.decode(request)
@@ -1,4 +1,4 @@
1
- from .local_window_service import LocalWindowService
1
+ from helm.benchmark.window_services.local_window_service import LocalWindowService
2
2
 
3
3
 
4
4
  class YaLMWindowService(LocalWindowService):
@@ -11,8 +11,8 @@ from helm.common.request import (
11
11
  GeneratedOutput,
12
12
  Token,
13
13
  )
14
- from .client import CachingClient, truncate_sequence, cleanup_str
15
- from .ai21_utils import AI21RequestError, handle_failed_request
14
+ from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
15
+ from helm.clients.ai21_utils import AI21RequestError, handle_failed_request
16
16
 
17
17
  try:
18
18
  from ai21 import AI21Client as AISDKClient
@@ -186,7 +186,7 @@ class AI21ChatClient(CachingClient):
186
186
  completions: List[GeneratedOutput] = []
187
187
 
188
188
  for choice in response["choices"]:
189
- completions.append(GeneratedOutput(text=choice["message"]["content"], logprob=0.0, tokens=[]))
189
+ completions.append(GeneratedOutput(text=choice["message"]["content"] or "", logprob=0.0, tokens=[]))
190
190
 
191
191
  return RequestResult(
192
192
  success=True,
@@ -4,7 +4,7 @@ from helm.common.cache import CacheConfig
4
4
  from helm.common.media_object import TEXT_TYPE
5
5
  from helm.common.optional_dependencies import handle_module_not_found_error
6
6
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
7
- from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
7
+ from helm.clients.client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
8
8
 
9
9
  try:
10
10
  from aleph_alpha_client import Client, CompletionRequest, CompletionResponse, Image, Prompt
@@ -1,3 +1,4 @@
1
+ import dataclasses
1
2
  from typing import Any, Dict, List, Optional, TypedDict, Union, cast
2
3
  import json
3
4
  import os
@@ -7,10 +8,11 @@ import time
7
8
  import urllib.parse
8
9
 
9
10
  from helm.common.cache import CacheConfig
10
- from helm.common.hierarchical_logger import htrack_block, hlog
11
+ from helm.common.hierarchical_logger import htrack_block, hlog, hwarn
11
12
  from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
12
13
  from helm.common.optional_dependencies import handle_module_not_found_error
13
14
  from helm.common.request import (
15
+ Thinking,
14
16
  wrap_request_time,
15
17
  EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
16
18
  Request,
@@ -30,8 +32,12 @@ from helm.clients.client import CachingClient, truncate_sequence, truncate_and_t
30
32
  try:
31
33
  from anthropic import Anthropic, BadRequestError
32
34
  from anthropic.types import MessageParam
35
+ from anthropic.types.message import Message
36
+ from anthropic.types.text_block import TextBlock
37
+ from anthropic.types.thinking_block import ThinkingBlock
33
38
  from anthropic.types.image_block_param import ImageBlockParam
34
39
  from anthropic.types.text_block_param import TextBlockParam
40
+ from anthropic.types.thinking_config_enabled_param import ThinkingConfigEnabledParam
35
41
  import websocket
36
42
  except ModuleNotFoundError as e:
37
43
  handle_module_not_found_error(e, ["anthropic"])
@@ -231,30 +237,41 @@ class AnthropicMessagesRequest(TypedDict, total=False):
231
237
  temperature: float
232
238
  top_k: int
233
239
  top_p: float
240
+ thinking: ThinkingConfigEnabledParam
234
241
 
235
242
 
236
243
  class AnthropicMessagesRequestError(NonRetriableException):
237
244
  pass
238
245
 
239
246
 
240
- class AnthropicMessagesResponseError(Exception):
247
+ class AnthropicMessagesEmptyContentError(Exception):
241
248
  pass
242
249
 
243
250
 
244
251
  class AnthropicMessagesClient(CachingClient):
245
252
  # Source: https://docs.anthropic.com/claude/docs/models-overview
246
- MAX_OUTPUT_TOKENS: int = 4096
253
+ MAX_OUTPUT_TOKENS: int = 64000
247
254
 
248
255
  MAX_IMAGE_SIZE_BYTES: int = 5242880 # 5MB
249
256
 
250
257
  def __init__(
251
- self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, api_key: Optional[str] = None
258
+ self,
259
+ tokenizer: Tokenizer,
260
+ tokenizer_name: str,
261
+ cache_config: CacheConfig,
262
+ thinking_budget_tokens: Optional[int] = None,
263
+ anthropic_model_name: Optional[str] = None,
264
+ api_key: Optional[str] = None,
265
+ stream: Optional[bool] = None,
252
266
  ):
253
267
  super().__init__(cache_config=cache_config)
254
268
  self.tokenizer = tokenizer
255
269
  self.tokenizer_name = tokenizer_name
256
270
  self.client = Anthropic(api_key=api_key)
257
271
  self.api_key: Optional[str] = api_key
272
+ self.anthropic_model_name: Optional[str] = anthropic_model_name
273
+ self.thinking_budget_tokens: Optional[int] = thinking_budget_tokens
274
+ self.stream: Optional[bool] = stream
258
275
 
259
276
  def make_request(self, request: Request) -> RequestResult:
260
277
  if request.max_tokens > AnthropicMessagesClient.MAX_OUTPUT_TOKENS:
@@ -293,8 +310,8 @@ class AnthropicMessagesClient(CachingClient):
293
310
  image_width > AnthropicClient.MAX_IMAGE_DIMENSION
294
311
  or image_height > AnthropicClient.MAX_IMAGE_DIMENSION
295
312
  ):
296
- hlog(
297
- f"WARNING: Image {image_location} exceeds max allowed size: "
313
+ hwarn(
314
+ f"Image {image_location} exceeds max allowed size: "
298
315
  f"{AnthropicClient.MAX_IMAGE_DIMENSION} pixels"
299
316
  )
300
317
  # Save the resized image to a temporary file
@@ -309,8 +326,8 @@ class AnthropicMessagesClient(CachingClient):
309
326
  base64_image = encode_base64(temp_file.name, format="JPEG")
310
327
 
311
328
  elif os.path.getsize(image_location) > AnthropicMessagesClient.MAX_IMAGE_SIZE_BYTES:
312
- hlog(
313
- f"WARNING: Image {image_location} exceeds max allowed size: "
329
+ hwarn(
330
+ f"Image {image_location} exceeds max allowed size: "
314
331
  f"{AnthropicMessagesClient.MAX_IMAGE_SIZE_BYTES} bytes"
315
332
  )
316
333
  # Resize the image so it is smaller than the max allowed size
@@ -351,7 +368,7 @@ class AnthropicMessagesClient(CachingClient):
351
368
 
352
369
  raw_request: AnthropicMessagesRequest = {
353
370
  "messages": messages,
354
- "model": request.model_engine,
371
+ "model": self.anthropic_model_name or request.model_engine,
355
372
  "stop_sequences": request.stop_sequences,
356
373
  "max_tokens": request.max_tokens,
357
374
  "temperature": request.temperature,
@@ -360,6 +377,15 @@ class AnthropicMessagesClient(CachingClient):
360
377
  }
361
378
  if system_message is not None:
362
379
  raw_request["system"] = cast(str, system_message["content"])
380
+ if self.thinking_budget_tokens:
381
+ raw_request["thinking"] = {
382
+ "type": "enabled",
383
+ "budget_tokens": self.thinking_budget_tokens,
384
+ }
385
+ # Avoid error:
386
+ # `top_k` must be unset when thinking is enabled. Please consult our documentation at https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#important-considerations-when-using-extended-thinking # noqa: E501
387
+ del raw_request["top_k"]
388
+
363
389
  completions: List[GeneratedOutput] = []
364
390
 
365
391
  # `num_completions` is not supported, so instead make `num_completions` separate requests.
@@ -367,11 +393,15 @@ class AnthropicMessagesClient(CachingClient):
367
393
 
368
394
  def do_it() -> Dict[str, Any]:
369
395
  try:
370
- result = self.client.messages.create(**raw_request).model_dump()
396
+ if self.stream:
397
+ with self.client.messages.stream(**raw_request) as message_stream:
398
+ result = message_stream.get_final_message().model_dump()
399
+ else:
400
+ result = self.client.messages.create(**raw_request).model_dump()
371
401
  if "content" not in result or not result["content"]:
372
- raise AnthropicMessagesResponseError(f"Anthropic response has empty content: {result}")
373
- elif "text" not in result["content"][0]:
374
- raise AnthropicMessagesResponseError(f"Anthropic response has non-text content: {result}")
402
+ raise AnthropicMessagesEmptyContentError(f"Anthropic response has empty content: {result}")
403
+ elif "text" not in result["content"][-1]:
404
+ raise AnthropicMessagesEmptyContentError(f"Anthropic response has non-text content: {result}")
375
405
  return result
376
406
  except BadRequestError as e:
377
407
  response = e.response.json()
@@ -387,9 +417,10 @@ class AnthropicMessagesClient(CachingClient):
387
417
  },
388
418
  request,
389
419
  )
390
- response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
391
- except AnthropicMessagesResponseError:
392
- hlog("WARNING: Response has empty content")
420
+ raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
421
+
422
+ except AnthropicMessagesEmptyContentError:
423
+ hwarn("Anthropic response has empty content")
393
424
  return RequestResult(
394
425
  success=False,
395
426
  cached=False,
@@ -399,32 +430,41 @@ class AnthropicMessagesClient(CachingClient):
399
430
  error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
400
431
  )
401
432
 
402
- if _is_content_moderation_failure(response):
403
- hlog(
404
- f"WARNING: Returning empty request for {request.model_deployment} "
405
- "due to content moderation filter"
406
- )
433
+ if _is_content_moderation_failure(raw_response):
434
+ hwarn(f"Returning empty request for {request.model_deployment} " "due to content moderation filter")
407
435
  return RequestResult(
408
436
  success=False,
409
437
  cached=cached,
410
- error=response["error"]["message"],
438
+ error=raw_response["error"]["message"],
411
439
  completions=[],
412
440
  embedding=[],
413
441
  error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
414
- request_time=response["request_time"],
415
- request_datetime=response["request_datetime"],
442
+ request_time=raw_response["request_time"],
443
+ request_datetime=raw_response["request_datetime"],
416
444
  )
417
445
 
446
+ response_message: Message = Message.model_validate(raw_response)
447
+ response_text: Optional[str] = None
448
+ response_thinking: Optional[str] = None
449
+ for content in response_message.content:
450
+ if isinstance(content, TextBlock):
451
+ response_text = content.text
452
+ elif isinstance(content, ThinkingBlock):
453
+ response_thinking = content.thinking
454
+ if response_text is None:
455
+ raise Exception("Anthropic response did not contain text block")
418
456
  completion = truncate_and_tokenize_response_text(
419
- response["content"][0]["text"], request, self.tokenizer, self.tokenizer_name, original_finish_reason=""
457
+ response_text, request, self.tokenizer, self.tokenizer_name, original_finish_reason=""
420
458
  )
459
+ if response_thinking is not None:
460
+ completion = dataclasses.replace(completion, thinking=Thinking(text=response_thinking))
421
461
  completions.append(completion)
422
462
 
423
463
  return RequestResult(
424
464
  success=True,
425
465
  cached=cached,
426
- request_time=response["request_time"],
427
- request_datetime=response["request_datetime"],
466
+ request_time=raw_response["request_time"],
467
+ request_datetime=raw_response["request_datetime"],
428
468
  completions=completions,
429
469
  embedding=[],
430
470
  )
@@ -617,8 +657,8 @@ class AnthropicLegacyClient(CachingClient):
617
657
  if logprobs["tokens"] != tokens:
618
658
  # This is a known limitation with the Anthropic API. For now keep track of the
619
659
  # entries with the mismatch.
620
- hlog(
621
- f"WARNING: naive truncation for logprobs did not work."
660
+ hwarn(
661
+ f"naive truncation for logprobs did not work."
622
662
  f"\nRequest:{raw_request}\nExpected: {tokens}\nActual: {logprobs['tokens']}"
623
663
  )
624
664
  check_logprobs = True
File without changes