crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
@@ -1,4 +1,7 @@
1
1
  import logging
2
+ import logging.config
3
+ import yaml
4
+ import os
2
5
  import sys
3
6
  import time
4
7
  from typing import Any, Callable, List, Optional
@@ -34,22 +37,41 @@ class HierarchicalLogger(object):
34
37
  def indent(self) -> str:
35
38
  return " " * len(self.start_times)
36
39
 
37
- def track_begin(self, x: Any) -> None:
38
- self.logger.info(self.indent() + str(x) + " {")
40
+ def track_begin(self, x: Any, **kwargs) -> None:
41
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
42
+ self.logger.info(self.indent() + str(x) + " {", **kwargs)
39
43
  sys.stdout.flush()
40
44
  self.start_times.append(time.time())
41
45
 
42
- def track_end(self) -> None:
46
+ def track_end(self, **kwargs) -> None:
47
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
43
48
  t = time.time() - self.start_times.pop()
44
- self.logger.info(self.indent() + "} [%s]" % (format_time(t)))
49
+ self.logger.info(self.indent() + "} [%s]" % (format_time(t)), **kwargs)
45
50
  sys.stdout.flush()
46
51
 
47
- def log(self, x: Any) -> None:
48
- self.logger.info(self.indent() + str(x))
52
+ def log(self, x: Any, **kwargs) -> None:
53
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
54
+ self.logger.info(self.indent() + str(x), **kwargs)
49
55
  sys.stdout.flush()
50
56
 
51
- def warn(self, x: Any) -> None:
52
- self.logger.warning(self.indent() + str(x))
57
+ def debug(self, x: Any, **kwargs) -> None:
58
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
59
+ self.logger.debug(self.indent() + str(x), **kwargs)
60
+ sys.stdout.flush()
61
+
62
+ def warn(self, x: Any, **kwargs) -> None:
63
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
64
+ self.logger.warning(self.indent() + str(x), **kwargs)
65
+ sys.stdout.flush()
66
+
67
+ def error(self, x: Any, **kwargs) -> None:
68
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
69
+ self.logger.error(self.indent() + str(x), **kwargs)
70
+ sys.stdout.flush()
71
+
72
+ def exception(self, x: Any, **kwargs) -> None:
73
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
74
+ self.logger.exception(self.indent() + str(x), **kwargs)
53
75
  sys.stdout.flush()
54
76
 
55
77
 
@@ -69,23 +91,41 @@ singleton = HierarchicalLogger()
69
91
  # Exposed public methods
70
92
 
71
93
 
72
- def hlog(x: Any) -> None:
73
- singleton.log(x)
94
+ def hdebug(x: Any, **kwargs) -> None:
95
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
96
+ singleton.debug(x, **kwargs)
97
+
98
+
99
+ def hlog(x: Any, **kwargs) -> None:
100
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
101
+ singleton.log(x, **kwargs)
74
102
 
75
103
 
76
- def hwarn(x: Any) -> None:
77
- singleton.warn(x)
104
+ def hwarn(x: Any, **kwargs) -> None:
105
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
106
+ singleton.warn(x, **kwargs)
107
+
108
+
109
+ def herror(x: Any, **kwargs) -> None:
110
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
111
+ singleton.error(x, **kwargs)
112
+
113
+
114
+ def hexception(x: Any, **kwargs) -> None:
115
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
116
+ singleton.exception(x, **kwargs)
78
117
 
79
118
 
80
119
  class htrack_block:
81
- def __init__(self, x: Any) -> None:
120
+ def __init__(self, x: Any, stacklevel=1) -> None:
121
+ self._stacklevel = stacklevel + 1
82
122
  self.x = x
83
123
 
84
124
  def __enter__(self) -> None:
85
- singleton.track_begin(self.x)
125
+ singleton.track_begin(self.x, stacklevel=self._stacklevel)
86
126
 
87
127
  def __exit__(self, tpe: Any, value: Any, callback: Any) -> None:
88
- singleton.track_end()
128
+ singleton.track_end(stacklevel=self._stacklevel)
89
129
 
90
130
 
91
131
  class htrack:
@@ -116,34 +156,63 @@ class htrack:
116
156
  description = description.replace("$" + k, str(v))
117
157
  else:
118
158
  description = ""
119
- with htrack_block(parent + fn.__name__ + description):
159
+ with htrack_block(parent + fn.__name__ + description, stacklevel=2):
120
160
  return fn(*args, **kwargs)
121
161
 
122
162
  return wrapper
123
163
 
124
164
 
125
- def setup_default_logging():
165
+ def setup_default_logging(config_path: Optional[str] = None):
126
166
  """
127
- Setup a default logger to STDOUT for HELM via Python logging
128
- """
129
- formatter = ColoredFormatter(
130
- "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
131
- datefmt="%Y-%m-%dT%H:%M:%S",
132
- reset=True,
133
- log_colors={
134
- "DEBUG": "cyan",
135
- "INFO": "green",
136
- "WARNING": "yellow",
137
- "ERROR": "red",
138
- "CRITICAL": "red,bg_white",
139
- },
140
- secondary_log_colors={},
141
- style="%",
142
- )
167
+ Setup Python logging for HELM
143
168
 
169
+ Priority:
170
+ 1. External config file (YAML or JSON).
171
+ 2. ENV var HELM_LOG_LEVEL, falling back to LOG_LEVEL.
172
+ 3. a default logger to STDOUT
173
+ """
144
174
  logger = logging.getLogger("helm")
145
- logger.setLevel(logging.INFO)
146
175
  logger.propagate = False
176
+
177
+ if config_path and os.path.exists(config_path):
178
+ with open(config_path, "r") as f:
179
+ config = yaml.safe_load(f)
180
+ logging.config.dictConfig(config)
181
+ hdebug("setup custom HELM logging")
182
+ return
183
+
184
+ log_level = (os.getenv("HELM_LOG_LEVEL") or os.getenv("LOG_LEVEL") or "INFO").upper()
185
+ try:
186
+ logger.setLevel(getattr(logging, log_level))
187
+ except AttributeError:
188
+ logger.setLevel(logging.INFO)
189
+
190
+ # Set formatter
191
+ formatter: Optional[logging.Formatter] = None
192
+ if sys.stdout.isatty():
193
+ try:
194
+ formatter = ColoredFormatter(
195
+ "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
196
+ datefmt="%Y-%m-%dT%H:%M:%S",
197
+ reset=True,
198
+ log_colors={
199
+ "DEBUG": "cyan",
200
+ "INFO": "green",
201
+ "WARNING": "yellow",
202
+ "ERROR": "red",
203
+ "CRITICAL": "red,bg_white",
204
+ },
205
+ style="%",
206
+ )
207
+ except ImportError:
208
+ pass
209
+
210
+ if formatter is None:
211
+ # fallback
212
+ formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
213
+
214
+ # Add default stdout handler
147
215
  handler = logging.StreamHandler(sys.stdout)
148
216
  handler.setFormatter(formatter)
149
217
  logger.addHandler(handler)
218
+ hdebug("setup default HELM logging")
@@ -55,14 +55,23 @@ def inject_object_spec_args(
55
55
  This is loosely based on instance (constant) bindings and provider bindings in Guice dependency injection.
56
56
 
57
57
  Example:
58
-
59
- class MyClass:
60
- def __init__(a: int, b: int, c: int, d: int = 0):
61
- pass
62
-
63
- old_object_spec = ObjectSpec(class_name="MyClass", args={"a": 11})
64
- new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
65
- # new_object_spec is now ObjectSpec(class_name="MyClass", args={"a": 11, "b": 12, "c": 13})
58
+ >>> from helm.common.object_spec import * # NOQA
59
+ >>> import sys, types
60
+ >>> # Given a custom class with hashable arguments
61
+ >>> class MyClass:
62
+ ... def __init__(a: int, b: int, c: int, d: int = 0):
63
+ ... pass
64
+ >>> #
65
+ >>> # <boilerplate>: make a dummy module for MyClass to make this doctest executable
66
+ >>> sys.modules["my_module"] = type("MyModule", (types.ModuleType,), {"MyClass": MyClass})("my_module")
67
+ >>> # </boilerplate>
68
+ >>> #
69
+ >>> # Define new style and old style object specs
70
+ >>> old_object_spec = ObjectSpec(class_name="my_module.MyClass", args={"a": 11})
71
+ >>> new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
72
+ >>> # new_object_spec is now
73
+ >>> print(new_object_spec)
74
+ ObjectSpec(class_name='my_module.MyClass', args={'a': 11, 'b': 12, 'c': 13})
66
75
  """
67
76
  cls = get_class_by_name(spec.class_name)
68
77
  init_signature = inspect.signature(cls.__init__)
@@ -93,6 +102,12 @@ def parse_object_spec(description: str) -> ObjectSpec:
93
102
  <class_name>:<key>=<value>,<key>=<value>
94
103
  Usually, the description is something that's succinct and can be typed on the command-line.
95
104
  Here, value defaults to string.
105
+
106
+ Example:
107
+ >>> from helm.common.object_spec import * # NOQA
108
+ >>> description = 'mscoco:model=huggingface_stable-diffusion-v1-4'
109
+ >>> parse_object_spec(description)
110
+ ObjectSpec(class_name='mscoco', args={'model': 'huggingface_stable-diffusion-v1-4'})
96
111
  """
97
112
 
98
113
  def parse_arg(arg: str) -> Tuple[str, Any]:
@@ -9,7 +9,7 @@ def handle_module_not_found_error(e: ModuleNotFoundError, suggestions: Optional[
9
9
  # TODO: Ask user to install more specific optional dependencies
10
10
  # e.g. crfm-helm[plots] or crfm-helm[server]
11
11
  suggested_commands = " or ".join(
12
- [f"`pip install crfm-helm[{suggestion}]`" for suggestion in (suggestions or []) + ["all"]]
12
+ [f'`pip install "crfm-helm[{suggestion}]"`' for suggestion in (suggestions or []) + ["all"]]
13
13
  )
14
14
  raise OptionalDependencyNotInstalled(
15
15
  f"Optional dependency {e.name} is not installed. Please run {suggested_commands} to install it."
@@ -1,5 +1,8 @@
1
1
  import shutil
2
2
  import os
3
+
4
+ import pytest
5
+
3
6
  from helm.common.general import (
4
7
  ensure_file_downloaded,
5
8
  format_tags,
@@ -12,6 +15,7 @@ from helm.common.general import (
12
15
 
13
16
 
14
17
  def test_ensure_file_downloaded():
18
+ pytest.skip("Skipping download tests because these downloads are not reliable and may be throttled")
15
19
  ensure_file_downloaded("https://ftp.gnu.org/gnu/tar/tar-1.34.tar.gz", "test-tar", unpack=True, unpack_type="untar")
16
20
  assert os.path.isdir("test-tar")
17
21
  shutil.rmtree("test-tar")
@@ -0,0 +1,94 @@
1
+ import sys
2
+ import tempfile
3
+ import textwrap
4
+ import pathlib
5
+ from helm.benchmark import run
6
+ from typing import List, Optional
7
+
8
+
9
+ class ArgvContext:
10
+ """
11
+ Helper to assign a temporary value to sys.argv and then restore it
12
+ """
13
+
14
+ def __init__(self, argv: Optional[List[str]]):
15
+ self.argv = argv
16
+ self._original_argv: Optional[List[str]] = None
17
+
18
+ def __enter__(self):
19
+ self._original_argv = sys.argv[:]
20
+ sys.argv = self.argv or []
21
+
22
+ def __exit__(self, exc_type, exc_val, exc_tb):
23
+ assert self._original_argv is not None # Satisfies mypy
24
+ sys.argv = self._original_argv
25
+
26
+
27
+ def test_run_with_custom_logging_config():
28
+ # Setup temporary directory
29
+ with tempfile.TemporaryDirectory(prefix="helm_test_") as tmp_dir_str:
30
+ tmp_dir = pathlib.Path(tmp_dir_str)
31
+ log_path = tmp_dir / "test.log"
32
+ log_config_path = tmp_dir / "test_config.yaml"
33
+
34
+ # Write custom YAML log config to file
35
+ log_config_text = textwrap.dedent(
36
+ f"""
37
+ version: 1
38
+ disable_existing_loggers: false
39
+ formatters:
40
+ simple:
41
+ datefmt: '%Y-%m-%dT%H:%M:%S'
42
+ format: '%(asctime)s %(levelname)s %(name)s %(message)s'
43
+ handlers:
44
+ file:
45
+ class: logging.FileHandler
46
+ filename: {log_path}
47
+ formatter: simple
48
+ level: DEBUG
49
+ mode: w
50
+ loggers:
51
+ helm:
52
+ handlers:
53
+ - file
54
+ level: DEBUG
55
+ propagate: false
56
+ """
57
+ ).strip()
58
+
59
+ log_config_path.write_text(log_config_text)
60
+
61
+ # Simulate command-line arguments
62
+ argv = [
63
+ "run.py", # Fake script name
64
+ "--run-entries",
65
+ "mmlu:subject=philosophy,model=openai/gpt2",
66
+ "-m",
67
+ "1",
68
+ "--suite",
69
+ "my-suite",
70
+ "--dry-run",
71
+ "--log-config",
72
+ str(log_config_path),
73
+ ]
74
+
75
+ # Call main
76
+ with ArgvContext(argv):
77
+ run.main()
78
+
79
+ # Check log file contents
80
+ assert log_path.exists(), "Log file was not created"
81
+ log_contents = log_path.read_text()
82
+
83
+ # Test that log file was written to disk as requested
84
+ print("Log Contents")
85
+ print("------------")
86
+ print(log_contents)
87
+
88
+ assert (
89
+ "mscoco" in log_contents or "huggingface" in log_contents or "dry-run" in log_contents
90
+ ), "Expected log content not found in log file:\n"
91
+
92
+
93
+ if __name__ == "__main__":
94
+ test_run_with_custom_logging_config()