crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.

This version of crfm-helm might be problematic.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/reeval_runner.py ADDED
@@ -0,0 +1,355 @@
+ import dacite
+ import json
+ import os
+ import typing
+ from collections import Counter
+ from typing import Any, Dict, List, Optional
+ import torch
+
+ from tqdm import tqdm
+ from dataclasses import replace
+ from datasets import load_dataset
+
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.common.general import ensure_directory_exists, write, asdict_without_nones
+ from helm.common.hierarchical_logger import hlog, htrack_block, hwarn
+ from helm.common.cache import cache_stats
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     create_scenario,
+     Instance,
+     get_scenario_cache_path,
+     with_instance_ids,
+ )
+ from helm.benchmark.adaptation.adapters.adapter import Adapter
+ from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory
+ from helm.benchmark.adaptation.scenario_state import ScenarioState
+ from helm.benchmark.run_spec import RunSpec
+ from helm.benchmark.data_preprocessor import DataPreprocessor
+ from helm.benchmark.executor import ExecutionSpec
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.dry_run_metrics import DryRunMetric
+ from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, create_metric, Stat
+ from helm.benchmark.runner import (
+     Runner,
+     remove_stats_nans,
+     remove_per_instance_stats_nans,
+ )
+
+ scenario_to_metric_name = {
+     "air_bench_2024": "air_score",
+     "babi_qa": "quasi_exact_match",
+     "bbq": "quasi_exact_match",
+     "blimp": "exact_match",
+     "boolq": "quasi_exact_match",
+     "civil_comments": "quasi_exact_match",
+     "dyck_language": "exact_match_indicator",
+     "entity_data_imputation": "quasi_exact_match",
+     "entity_matching": "quasi_exact_match",
+     "imdb": "quasi_exact_match",
+     "legal_support": "quasi_exact_match",
+     "raft": "quasi_exact_match",
+     "synthetic_reasoning": "quasi_exact_match",
+     "truthful_qa": "exact_match",
+     "wikifact": "quasi_exact_match",
+     "mmlu": "exact_match",
+     "commonsense": "exact_match",
+     "gsm": "final_number_exact_match",
+     # "gsm": ["exact_match_indicator", "final_number_exact_match"],
+     "legalbench": "quasi_exact_match",
+     "math": "math_equiv_chain_of_thought",
+     "med_qa": "quasi_exact_match",
+     "thai_exam": "exact_match",
+ }
+
+
+ class REEvalRunner(Runner):
+     """
+     This runner implements the basic (non-amortized) method described in the paper
+     `Reliable and Efficient Amortized Model-Based Evaluation`. This approach, which is
+     also known as Computerized Adaptive Testing (CAT) within the framework of Item Response
+     Theory (IRT), leverages adaptive testing to evaluate model performance.
+
+     The difficulties of the questions are provided in a HuggingFace repository. In addition,
+     the authors of the paper will supply a Python package for calculating these difficulties.
+     At each iteration, the runner estimates the model's ability based on all previously
+     administered questions and their corresponding responses. It then selects the next question
+     whose difficulty is closest to the estimated ability, thereby reliably and efficiently
+     eliciting the model's ability.
+     """
+
+     def __init__(
+         self,
+         execution_spec: ExecutionSpec,
+         output_path: str,
+         suite: str,
+         skip_instances: bool,
+         cache_instances: bool,
+         cache_instances_only: bool,
+         skip_completed_runs: bool,
+         exit_on_error: bool,
+     ):
+         super().__init__(
+             execution_spec=execution_spec,
+             output_path=output_path,
+             suite=suite,
+             skip_instances=skip_instances,
+             cache_instances=cache_instances,
+             cache_instances_only=cache_instances_only,
+             skip_completed_runs=skip_completed_runs,
+             exit_on_error=exit_on_error,
+         )
+
+     def _estimate_model_ability(
+         self,
+         old_ability: float,
+         response_correctness: List[float],
+         instance_difficulties: List[float],
+     ) -> float:
+         def closure():
+             optim.zero_grad()
+             probs = torch.sigmoid(ability + difficulties)
+             loss = -torch.distributions.Bernoulli(probs=probs).log_prob(responses).mean()
+             loss.backward()
+             return loss
+
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         responses = torch.tensor(response_correctness, device=device)
+         difficulties = torch.tensor(instance_difficulties, device=device)
+         ability = torch.tensor([old_ability], requires_grad=True, device=device)
+         optim = torch.optim.LBFGS([ability], lr=0.1, max_iter=20, history_size=10, line_search_fn="strong_wolfe")
+
+         for iteration in range(100):
+             loss = optim.step(closure)
+
+             if iteration > 0:
+                 prev_ability = ability.clone()
+                 prev_loss = loss
+                 d_loss = prev_loss - loss
+                 d_theta = torch.norm(prev_ability - ability, p=2)
+                 grad_norm = torch.norm(optim.param_groups[0]["params"][0].grad, p=2)
+                 if d_loss < 1e-5 and d_theta < 1e-5 and grad_norm < 1e-5:
+                     break
+
+         return ability.item()
+
+     def run_one(self, run_spec: RunSpec):
+         run_path: str = self._get_run_path(run_spec)
+         if self.skip_completed_runs and self._is_run_completed(run_path):
+             hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+             return
+         ensure_directory_exists(run_path)
+
+         # Load the scenario
+         scenario: Scenario = create_scenario(run_spec.scenario_spec)
+
+         # This 'output_path' will be used when the model's input instances are saved.
+         args_str = ",".join([f"{k}={v}" for k, v in sorted(run_spec.scenario_spec.args.items())])
+         scenario_name_with_args = f"{scenario.name}:{args_str}" if args_str else f"{scenario.name}"
+         input_instances_output_path = os.path.join(self.instances_path, scenario_name_with_args)
+         input_instances_file_path = os.path.join(input_instances_output_path, "input_instances.json")
+
+         instances: List[Instance]
+         if self.skip_instances:
+             instances = []
+         else:
+             if self.cache_instances and os.path.exists(input_instances_file_path):
+                 with open(input_instances_file_path) as f:
+                     json_instances: List[Dict[str, Any]] = json.load(f)
+                 instances = [dacite.from_dict(Instance, instance) for instance in json_instances]
+             else:
+                 # Create the instances of the scenario
+                 scenario_output_path = get_scenario_cache_path(self.output_path, scenario.name)
+                 with htrack_block("scenario.get_instances"):
+                     instances = scenario.get_instances(scenario_output_path)
+             if self.cache_instances and not os.path.exists(input_instances_file_path):
+                 # Save instances to file
+                 ensure_directory_exists(input_instances_output_path)
+                 write(
+                     os.path.join(input_instances_file_path),
+                     json.dumps([asdict_without_nones(instance) for instance in instances], indent=2),
+                 )
+             if self.cache_instances_only:
+                 return  # Exit after saving the instances.
+
+         # Give each instance a unique ID
+         if any([instance.id is None for instance in instances]):
+             instances = with_instance_ids(instances)
+
+         # Data preprocessing
+         instances = DataPreprocessor(run_spec.data_augmenter_spec).preprocess(
+             instances, self.executor.execution_spec.parallelism
+         )
+
+         # Adapt (convert to requests)
+         adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
+         unasked_request_states_without_difficulty: List[RequestState] = adapter.adapt(
+             instances, self.executor.execution_spec.parallelism
+         )
+
+         # load difficulty
+         split_name = "dyck_language_np_3" if scenario.name == "dyck_language" else scenario.name
+         try:
+             difficulty_dataset = load_dataset("stair-lab/reeval-difficulty", split=split_name)
+             prompt_to_difficulty: dict[str, float] = {row["request.prompt"]: row["z"] for row in difficulty_dataset}
+         except ValueError:
+             hwarn(f"no available difficulty for {split_name}, skipping")
+             return
+
+         unasked_request_states: List[RequestState] = []
+         for request_state in unasked_request_states_without_difficulty:
+             prompt = request_state.request.prompt
+             if prompt in prompt_to_difficulty:
+                 difficulty = prompt_to_difficulty[prompt]
+                 current_extra_data = request_state.instance.extra_data or {}
+                 if "difficulty" in current_extra_data:
+                     raise Exception("Extra_data already contains a 'difficulty' key.")
+                 new_extra_data = current_extra_data.copy()
+                 new_extra_data["difficulty"] = difficulty
+                 new_instance = replace(request_state.instance, extra_data=new_extra_data)
+                 new_request_state = replace(request_state, instance=new_instance)
+                 unasked_request_states.append(new_request_state)
+         assert unasked_request_states
+
+         # Execute the requests in an reeval manner
+         assert run_spec.adapter_spec.reeval_parameters is not None
+         model_ability: float = run_spec.adapter_spec.reeval_parameters.model_ability or 0.0
+         scenario_metric_name: str = scenario_to_metric_name[scenario.name]
+
+         asked_request_states: List[RequestState] = []
+         reeval_trajectory: Dict[str, List[float]] = {
+             "model_ability": [],
+             "response_correctness": [],
+             "instance_difficulties": [],
+         }
+
+         assert run_spec.adapter_spec.max_eval_instances is not None
+         for _ in tqdm(range(run_spec.adapter_spec.max_eval_instances), desc="REEval Execution"):
+             if not unasked_request_states:
+                 break
+
+             selected_item: Optional[RequestState] = None
+             min_diff = float("inf")
+             for item in unasked_request_states:
+                 assert item.instance.extra_data is not None
+                 diff = abs(item.instance.extra_data["difficulty"] + model_ability)
+                 if diff < min_diff:
+                     min_diff = diff
+                     selected_item = item
+             assert selected_item is not None
+             unasked_request_states.remove(selected_item)
+
+             # Execute the request
+             single_scenario_state: ScenarioState = ScenarioState(
+                 adapter_spec=run_spec.adapter_spec,
+                 request_states=[selected_item],
+                 annotator_specs=run_spec.annotators,
+             )
+
+             # Execute (fill up results)
+             single_scenario_state = self.executor.execute(single_scenario_state)
+
+             # Annotate (post-process the results)
+             single_scenario_state = self.annotator_executor.execute(single_scenario_state)
+
+             # Apply the metrics
+             # When performing a dry run, only estimate the number of tokens instead
+             # of calculating the metrics.
+             metrics: List[MetricInterface] = (
+                 [DryRunMetric()]
+                 if self.dry_run
+                 else [create_metric(metric_spec) for metric_spec in run_spec.metric_specs]
+             )
+
+             temp_per_instance_stats: List[PerInstanceStats] = []
+             with htrack_block(f"{len(metrics)} metrics"):
+                 for metric in metrics:
+                     with htrack_block(metric):
+                         temp_metric_result: MetricResult = metric.evaluate(
+                             single_scenario_state,
+                             self.metric_service,
+                             self.eval_cache_path,
+                             self.executor.execution_spec.parallelism,
+                         )
+                         temp_per_instance_stats.extend(temp_metric_result.per_instance_stats)
+
+             # Update the reeval request states
+             asked_request_states.extend(single_scenario_state.request_states)
+
+             # Update the reeval trajectory
+             reeval_trajectory["model_ability"].append(model_ability)
+             scenario_metric_value = [
+                 s for s in temp_per_instance_stats[0].stats if s.name.name == scenario_metric_name
+             ][0].mean
+
+             assert scenario_metric_value is not None
+             reeval_trajectory["response_correctness"].append(scenario_metric_value)
+             assert selected_item.instance.extra_data is not None
+             reeval_trajectory["instance_difficulties"].append(selected_item.instance.extra_data["difficulty"])
+
+             # Estimate the model ability
+             model_ability = self._estimate_model_ability(
+                 old_ability=model_ability,
+                 response_correctness=reeval_trajectory["response_correctness"],
+                 instance_difficulties=reeval_trajectory["instance_difficulties"],
+             )
+
+         # Create the scenario state
+         scenario_state: ScenarioState = ScenarioState(
+             adapter_spec=run_spec.adapter_spec,
+             request_states=asked_request_states,
+             annotator_specs=run_spec.annotators,
+         )
+
+         stats: List[Stat] = []
+         per_instance_stats: List[PerInstanceStats] = []
+         with htrack_block(f"{len(metrics)} metrics"):
+             for metric in metrics:
+                 with htrack_block(metric):
+                     metric_result: MetricResult = metric.evaluate(
+                         scenario_state,
+                         self.metric_service,
+                         self.eval_cache_path,
+                         self.executor.execution_spec.parallelism,
+                     )
+                     stats.extend(metric_result.aggregated_stats)
+                     per_instance_stats.extend(metric_result.per_instance_stats)
+
+         # Check that there aren't duplicate `Stat`s
+         # Note: doesn't catch near misses.
+         metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
+         for metric_name, count in metric_counts.items():
+             if count > 1:
+                 hwarn(f"duplicate metric name {metric_name}")
+
+         # Print out the number of stats
+         hlog(f"Generated {len(stats)} stats.")
+
+         if self.skip_instances:
+             hlog("skip_instances was True. Skipping writing results out.")
+             return
+
+         # Output benchmarking information and results to files
+         write(os.path.join(run_path, "run_spec.json"), json.dumps(asdict_without_nones(run_spec), indent=2))
+
+         # Write out scenario
+         write(os.path.join(run_path, "scenario.json"), json.dumps(asdict_without_nones(scenario), indent=2))
+
+         # Write scenario state
+         write(os.path.join(run_path, "scenario_state.json"), json.dumps(asdict_without_nones(scenario_state), indent=2))
+
+         write(
+             os.path.join(run_path, "stats.json"),
+             json.dumps([asdict_without_nones(stat) for stat in remove_stats_nans(stats)], indent=2),
+         )
+         write(
+             os.path.join(run_path, "per_instance_stats.json"),
+             json.dumps(list(map(asdict_without_nones, remove_per_instance_stats_nans(per_instance_stats))), indent=2),
+         )
+
+         write(
+             os.path.join(run_path, "reeval_trajectory.json"),
+             json.dumps(reeval_trajectory, indent=2),
+         )
+
+         cache_stats.print_status()
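
Editor's note: the ability update in `_estimate_model_ability` above is a one-parameter (Rasch-style) IRT fit. Each response is modeled as a Bernoulli draw with p(correct) = sigmoid(ability + difficulty), the ability is re-estimated by maximum likelihood with LBFGS after every answered question, and the next question is the one whose difficulty is closest to the negated ability estimate, i.e. where the predicted success probability is near 0.5. The following is a minimal standalone sketch of that loop for illustration only; `estimate_ability` and `pick_next_item` are hypothetical helper names, not HELM APIs.

    import torch

    def estimate_ability(old_ability: float, correctness: list, difficulties: list, steps: int = 20) -> float:
        # Maximum-likelihood ability under p(correct) = sigmoid(ability + difficulty),
        # using the same sign convention as _estimate_model_ability above.
        responses = torch.tensor(correctness, dtype=torch.float32)
        diffs = torch.tensor(difficulties, dtype=torch.float32)
        ability = torch.tensor([old_ability], requires_grad=True)
        optim = torch.optim.LBFGS([ability], lr=0.1, max_iter=20, line_search_fn="strong_wolfe")

        def closure():
            optim.zero_grad()
            probs = torch.sigmoid(ability + diffs)
            loss = -torch.distributions.Bernoulli(probs=probs).log_prob(responses).mean()
            loss.backward()
            return loss

        for _ in range(steps):
            optim.step(closure)
        return ability.item()

    def pick_next_item(ability: float, remaining_difficulties: list) -> int:
        # Pick the item whose difficulty is closest to -ability, i.e. where the model is
        # predicted to answer correctly about half the time (the most informative item).
        return min(range(len(remaining_difficulties)), key=lambda i: abs(remaining_difficulties[i] + ability))

    # Two correct answers and one miss nudge the ability estimate away from 0.
    ability = estimate_ability(0.0, [1.0, 1.0, 0.0], [-0.5, 0.2, 1.3])
    print(ability, pick_next_item(ability, [-1.0, 0.0, 0.8]))
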
helm/benchmark/run.py CHANGED
@@ -9,7 +9,7 @@ from helm.benchmark import model_metadata_registry
  from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
  from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
  from helm.common.general import ensure_directory_exists
- from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+ from helm.common.hierarchical_logger import hlog, htrack, htrack_block, setup_default_logging, hwarn
  from helm.common.authentication import Authentication
  from helm.common.object_spec import parse_object_spec, get_class_by_name
  from helm.proxy.services.remote_service import create_authentication, add_service_args
@@ -200,83 +200,9 @@ def validate_args(args):
 
 
  @htrack(None)
- def main():
- parser = argparse.ArgumentParser()
- add_service_args(parser)
- parser.add_argument(
- "-c",
- "--conf-paths",
- nargs="+",
- help="Where to read RunSpecs to run from",
- default=[],
- )
- parser.add_argument(
- "--models-to-run",
- nargs="+",
- help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
- default=None,
- )
- parser.add_argument(
- "--groups-to-run",
- nargs="+",
- help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
- default=None,
- )
- parser.add_argument(
- "--exit-on-error",
- action="store_true",
- help="Fail and exit immediately if a particular RunSpec fails.",
- )
- parser.add_argument(
- "--skip-completed-runs",
- action="store_true",
- help="Skip RunSpecs that have completed i.e. output files exists.",
- )
- parser.add_argument(
- "--priority",
- type=int,
- default=None,
- help="Run RunSpecs with priority less than or equal to this number. "
- "If a value for --priority is not specified, run on everything",
- )
- parser.add_argument(
- "--run-specs",
- nargs="*",
- help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
- "Specifies run entries to run.",
- default=[],
- )
- parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
- parser.add_argument(
- "--enable-huggingface-models",
- nargs="+",
- default=[],
- help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
- "Format: namespace/model_name[@revision]",
- )
- parser.add_argument(
- "--enable-local-huggingface-models",
- nargs="+",
- default=[],
- help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
- )
- parser.add_argument(
- "--runner-class-name",
- type=str,
- default=None,
- help="Full class name of the Runner class to use. If unset, uses the default Runner.",
- )
- parser.add_argument(
- "--openvino",
- action="store_true",
- default=False,
- help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
- "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
- )
- add_run_args(parser)
- args = parser.parse_args()
- validate_args(args)
+ def helm_run(args):
 
+ validate_args(args)
  register_builtin_configs_from_helm_package()
  register_configs_from_directory(args.local_path)
 
@@ -284,19 +210,13 @@ def main():
  from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
 
  for huggingface_model_name in args.enable_huggingface_models:
- if args.openvino:
- register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
- else:
- register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+ register_huggingface_hub_model_from_flag_value(huggingface_model_name)
 
  if args.enable_local_huggingface_models:
  from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
 
  for huggingface_model_path in args.enable_local_huggingface_models:
- if args.openvino:
- register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
- else:
- register_huggingface_local_model_from_flag_value(huggingface_model_path)
+ register_huggingface_local_model_from_flag_value(huggingface_model_path)
 
  run_entries: List[RunEntry] = []
  if args.conf_paths:
@@ -323,12 +243,16 @@ def main():
  if model_to_run not in all_models:
  raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
  else:
- model_expander_pattern = re.compile(
+ model_expander_wildcard_pattern = re.compile(
  r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b" # noqa: E501
  )
- if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+ if any(model_expander_wildcard_pattern.search(run_entry.description) for run_entry in run_entries):
  raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
 
+ model_expander_pattern = re.compile(r"\bmodel=\b")
+ if not any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+ raise Exception("--models-to-run must be set if the `models=` run expander is omitted")
+
  run_specs = run_entries_to_run_specs(
  run_entries=run_entries,
  max_eval_instances=args.max_eval_instances,
@@ -367,13 +291,85 @@ def main():
  )
 
  if args.run_specs:
- hlog(
- "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
- "Use --run-entries instead."
+ hwarn(
+ "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
  )
 
  hlog("Done.")
 
 
+ # Separate parsing from starting HELM so we can setup logging
+ def main():
+ parser = argparse.ArgumentParser()
+ add_service_args(parser)
+ parser.add_argument(
+ "-c",
+ "--conf-paths",
+ nargs="+",
+ help="Where to read RunSpecs to run from",
+ default=[],
+ )
+ parser.add_argument(
+ "--models-to-run",
+ nargs="+",
+ help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
+ default=None,
+ )
+ parser.add_argument(
+ "--groups-to-run",
+ nargs="+",
+ help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
+ default=None,
+ )
+ parser.add_argument(
+ "--exit-on-error",
+ action="store_true",
+ help="Fail and exit immediately if a particular RunSpec fails.",
+ )
+ parser.add_argument(
+ "--skip-completed-runs",
+ action="store_true",
+ help="Skip RunSpecs that have completed i.e. output files exists.",
+ )
+ parser.add_argument(
+ "--priority",
+ type=int,
+ default=None,
+ help="Run RunSpecs with priority less than or equal to this number. "
+ "If a value for --priority is not specified, run on everything",
+ )
+ parser.add_argument(
+ "--run-specs",
+ nargs="*",
+ help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
+ "Specifies run entries to run.",
+ default=[],
+ )
+ parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
+ parser.add_argument(
+ "--enable-huggingface-models",
+ nargs="+",
+ default=[],
+ help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+ "Format: namespace/model_name[@revision]",
+ )
+ parser.add_argument(
+ "--enable-local-huggingface-models",
+ nargs="+",
+ default=[],
+ help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
+ )
+ parser.add_argument(
+ "--runner-class-name",
+ type=str,
+ default=None,
+ help="Full class name of the Runner class to use. If unset, uses the default Runner.",
+ )
+ add_run_args(parser)
+ args = parser.parse_args()
+ setup_default_logging()
+ return helm_run(args)
+
+
  if __name__ == "__main__":
  main()
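
A note on the run.py validation change above: the old code only rejected wildcard model expansions (model=all and friends) when --models-to-run was absent, while the added `model_expander_pattern = re.compile(r"\bmodel=\b")` check also rejects run entries that omit `model=` entirely. The sketch below uses both regexes verbatim from the diff; the example run-entry descriptions are made up for illustration only.

    # Both regexes are copied from the diff above; the descriptions are hypothetical examples.
    import re

    wildcard_pattern = re.compile(
        r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"
    )
    model_pattern = re.compile(r"\bmodel=\b")

    descriptions = [
        "mmlu:subject=anatomy,model=all",          # wildcard expansion -> requires --models-to-run
        "mmlu:subject=anatomy,model=openai/gpt2",  # explicit model -> fine without --models-to-run
        "mmlu:subject=anatomy",                    # no model= at all -> now also requires --models-to-run
    ]
    print([bool(wildcard_pattern.search(d)) for d in descriptions])  # [True, False, False]
    print([bool(model_pattern.search(d)) for d in descriptions])     # [True, True, False]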