crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -86,7 +86,9 @@ tokenizer_configs:
86
86
  # Anthropic
87
87
  - name: anthropic/claude
88
88
  tokenizer_spec:
89
- class_name: "helm.tokenizers.anthropic_tokenizer.AnthropicTokenizer"
89
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
90
+ args:
91
+ pretrained_model_name_or_path: Xenova/claude-tokenizer
90
92
  end_of_text_token: "<|endoftext|>"
91
93
  prefix_token: "<|endoftext|>"
92
94
 
@@ -169,12 +171,25 @@ tokenizer_configs:
169
171
  end_of_text_token: "<|end▁of▁sentence|>"
170
172
  prefix_token: "<|begin▁of▁sentence|>"
171
173
 
174
+ - name: deepseek-ai/deepseek-v3
175
+ tokenizer_spec:
176
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
177
+ end_of_text_token: "<|end▁of▁sentence|>"
178
+ prefix_token: "<|begin▁of▁sentence|>"
179
+
180
+ - name: deepseek-ai/deepseek-r1
181
+ tokenizer_spec:
182
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
183
+ end_of_text_token: "<|end▁of▁sentence|>"
184
+ prefix_token: "<|begin▁of▁sentence|>"
185
+
172
186
  # EleutherAI
173
187
  - name: EleutherAI/gpt-j-6B
174
188
  tokenizer_spec:
175
189
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
176
190
  end_of_text_token: "<|endoftext|>"
177
191
  prefix_token: "<|endoftext|>"
192
+
178
193
  - name: EleutherAI/gpt-neox-20b
179
194
  tokenizer_spec:
180
195
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -317,6 +332,14 @@ tokenizer_configs:
317
332
  prefix_token: "<|begin_of_text|>"
318
333
  end_of_text_token: "<|end_of_text|>"
319
334
 
335
+ - name: meta/llama-3-8b-instruct
336
+ tokenizer_spec:
337
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
338
+ args:
339
+ pretrained_model_name_or_path: meta-llama/Meta-Llama-3.1-8B-Instruct
340
+ prefix_token: "<|begin_of_text|>"
341
+ end_of_text_token: "<|eot_id|>"
342
+
320
343
  - name: meta/llama-3.1-8b
321
344
  tokenizer_spec:
322
345
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -325,6 +348,54 @@ tokenizer_configs:
325
348
  prefix_token: "<|begin_of_text|>"
326
349
  end_of_text_token: "<|end_of_text|>"
327
350
 
351
+ - name: meta/llama-3.1-8b-instruct
352
+ tokenizer_spec:
353
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
354
+ args:
355
+ pretrained_model_name_or_path: meta-llama/Meta-Llama-3.1-8B-Instruct
356
+ prefix_token: "<|begin_of_text|>"
357
+ end_of_text_token: "<|eot_id|>"
358
+
359
+ - name: meta/llama-3.2-3b-instruct
360
+ tokenizer_spec:
361
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
362
+ args:
363
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-3B-Instruct
364
+ prefix_token: "<|begin_of_text|>"
365
+ end_of_text_token: "<|eot_id|>"
366
+
367
+ - name: meta/llama-3.2-1b-instruct
368
+ tokenizer_spec:
369
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
370
+ args:
371
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
372
+ prefix_token: "<|begin_of_text|>"
373
+ end_of_text_token: "<|eot_id|>"
374
+
375
+ - name: meta/llama-3.1-8b-instruct
376
+ tokenizer_spec:
377
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
378
+ args:
379
+ pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
380
+ prefix_token: "<|begin_of_text|>"
381
+ end_of_text_token: "<|eot_id|>"
382
+
383
+ - name: meta/llama-3.2-11b-vision-instruct
384
+ tokenizer_spec:
385
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
386
+ args:
387
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-11B-Vision-Instruct
388
+ prefix_token: "<|begin_of_text|>"
389
+ end_of_text_token: "<|eot_id|>"
390
+
391
+ - name: meta/llama-3.3-70b-instruct
392
+ tokenizer_spec:
393
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
394
+ args:
395
+ pretrained_model_name_or_path: meta-llama/Llama-3.3-70B-Instruct
396
+ prefix_token: "<|begin_of_text|>"
397
+ end_of_text_token: "<|eot_id|>"
398
+
328
399
  # 01-ai
329
400
  - name: 01-ai/Yi-6B
330
401
  tokenizer_spec:
@@ -382,6 +453,18 @@ tokenizer_configs:
382
453
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
383
454
  end_of_text_token: "<|endoftext|>"
384
455
  prefix_token: "<s>"
456
+
457
+ - name: microsoft/phi-3.5-mini-instruct
458
+ tokenizer_spec:
459
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
460
+ end_of_text_token: "<|endoftext|>"
461
+ prefix_token: "<s>"
462
+
463
+ - name: microsoft/phi-3.5-mini-instruct
464
+ tokenizer_spec:
465
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
466
+ end_of_text_token: "<|endoftext|>"
467
+ prefix_token: "<s>"
385
468
 
386
469
  # Mistralai
387
470
  - name: mistralai/Mistral-7B-v0.1
@@ -420,6 +503,37 @@ tokenizer_configs:
420
503
  end_of_text_token: "</s>"
421
504
  prefix_token: "<s>"
422
505
 
506
+ - name: mistralai/Mistral-Large-Instruct-2411
507
+ tokenizer_spec:
508
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
509
+ end_of_text_token: "</s>"
510
+ prefix_token: "<s>"
511
+
512
+ - name: mistralai/Ministral-8B-Instruct-2410
513
+ tokenizer_spec:
514
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
515
+ end_of_text_token: "</s>"
516
+ prefix_token: "<s>"
517
+
518
+ - name: mistralai/Mistral-Small-24B-Instruct-2501
519
+ tokenizer_spec:
520
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
521
+ end_of_text_token: "</s>"
522
+ prefix_token: "<s>"
523
+
524
+ # Nectec
525
+ - name: nectec/OpenThaiLLM-Prebuilt-7B
526
+ tokenizer_spec:
527
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
528
+ end_of_text_token: "<|im_end|>"
529
+ prefix_token: ""
530
+
531
+ - name: nectec/Pathumma-llm-text-1.0.0
532
+ tokenizer_spec:
533
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
534
+ end_of_text_token: "<|im_end|>"
535
+ prefix_token: "<|im_start|>"
536
+
423
537
  # Neurips
424
538
  - name: neurips/local
425
539
  tokenizer_spec:
@@ -437,6 +551,14 @@ tokenizer_configs:
437
551
  end_of_text_token: "<|endoftext|>"
438
552
  prefix_token: "<|endoftext|>"
439
553
 
554
+ - name: nvidia/llama-3.1-nemotron-70b-instruct
555
+ tokenizer_spec:
556
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
557
+ args:
558
+ pretrained_model_name_or_path: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
559
+ end_of_text_token: "<|eot_id|>"
560
+ prefix_token: "<|begin_of_text|>"
561
+
440
562
  # OpenAI
441
563
  - name: openai/cl100k_base
442
564
  tokenizer_spec:
@@ -486,8 +608,22 @@ tokenizer_configs:
486
608
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
487
609
  args:
488
610
  pretrained_model_name_or_path: Qwen/Qwen2-72B-Instruct
489
- end_of_text_token: <|im_end|>"
490
- prefix_token: "<|im_start|>'"
611
+ end_of_text_token: "<|im_end|>"
612
+ prefix_token: "<|im_start|>"
613
+
614
+ - name: qwen/qwen2.5-7b-instruct
615
+ tokenizer_spec:
616
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
617
+ args:
618
+ pretrained_model_name_or_path: Qwen/Qwen2.5-7B-Instruct
619
+ end_of_text_token: "<|im_end|>"
620
+ prefix_token: "<|im_start|>"
621
+
622
+ - name: qwen/qwq-32b-preview
623
+ tokenizer_spec:
624
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
625
+ end_of_text_token: "<|im_end|>"
626
+ prefix_token: ""
491
627
 
492
628
  - name: qwen/qwen-vl
493
629
  tokenizer_spec:
@@ -509,6 +645,25 @@ tokenizer_configs:
509
645
  end_of_text_token: "<|endoftext|>"
510
646
  prefix_token: ""
511
647
 
648
+ - name: qwen/qwen-audio-chat
649
+ tokenizer_spec:
650
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
651
+ args:
652
+ pretrained_model_name_or_path: Qwen/Qwen-Audio-Chat
653
+ trust_remote_code: true
654
+ # Source: https://github.com/QwenLM/Qwen-Audio
655
+ end_of_text_token: "<|endoftext|>"
656
+ prefix_token: ""
657
+
658
+ - name: qwen/qwen2-audio-instruct
659
+ tokenizer_spec:
660
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
661
+ args:
662
+ pretrained_model_name_or_path: Qwen/Qwen2-Audio-7B-Instruct
663
+ trust_remote_code: false
664
+ end_of_text_token: "<|endoftext|>"
665
+ prefix_token: ""
666
+
512
667
  # SambaLingo
513
668
  - name: sambanova/sambalingo-thai-base
514
669
  tokenizer_spec:
@@ -549,6 +704,15 @@ tokenizer_configs:
549
704
  end_of_text_token: "</s>"
550
705
  prefix_token: "<s>"
551
706
 
707
+ # Upstage
708
+ - name: upstage/solar-pro-preview-instruct
709
+ tokenizer_spec:
710
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
711
+ args:
712
+ trust_remote_code: true
713
+ end_of_text_token: "<|im_end|>"
714
+ prefix_token: "<|startoftext|>"
715
+
552
716
  # Writer
553
717
  - name: writer/gpt2
554
718
  tokenizer_spec:
@@ -564,3 +728,239 @@ tokenizer_configs:
564
728
  class_name: "helm.tokenizers.yalm_tokenizer.YaLMTokenizer"
565
729
  end_of_text_token: "</s>"
566
730
  prefix_token: "</s>"
731
+
732
+ # Diva Llama
733
+ - name: stanford/diva-llama
734
+ tokenizer_spec:
735
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
736
+ args:
737
+ pretrained_model_name_or_path: WillHeld/DiVA-llama-3-v0-8b
738
+ trust_remote_code: true
739
+ prefix_token: "<|begin_of_text|>"
740
+ end_of_text_token: "<|eot_id|>"
741
+
742
+ # LLaMA-Omni
743
+ - name: ictnlp/llama-3.1-8b-omni
744
+ tokenizer_spec:
745
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
746
+ args:
747
+ pretrained_model_name_or_path: ICTNLP/Llama-3.1-8B-Omni
748
+ trust_remote_code: false
749
+ end_of_text_token: "<|eot_id|>"
750
+ prefix_token: "<|begin_of_text|>"
751
+
752
+ # IBM - Granite 3.0
753
+ - name: ibm-granite/granite-3.0-2b-base
754
+ tokenizer_spec:
755
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
756
+ args:
757
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-2b-base
758
+ end_of_text_token: ""
759
+ prefix_token: ""
760
+
761
+ - name: ibm-granite/granite-3.0-2b-instruct
762
+ tokenizer_spec:
763
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
764
+ args:
765
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-2b-instruct
766
+ end_of_text_token: ""
767
+ prefix_token: ""
768
+
769
+ - name: ibm-granite/granite-3.0-8b-instruct
770
+ tokenizer_spec:
771
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
772
+ args:
773
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-8b-instruct
774
+ end_of_text_token: ""
775
+ prefix_token: ""
776
+
777
+ - name: ibm-granite/granite-3.0-8b-base
778
+ tokenizer_spec:
779
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
780
+ args:
781
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-8b-base
782
+ end_of_text_token: ""
783
+ prefix_token: ""
784
+
785
+ - name: ibm-granite/granite-3.0-3b-a800m-instruct
786
+ tokenizer_spec:
787
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
788
+ args:
789
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-3b-a800m-instruct
790
+ end_of_text_token: ""
791
+ prefix_token: ""
792
+
793
+ - name: ibm-granite/granite-3.0-3b-a800m-base
794
+ tokenizer_spec:
795
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
796
+ args:
797
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-3b-a800m-base
798
+ end_of_text_token: ""
799
+ prefix_token: ""
800
+
801
+ - name: ibm-granite/granite-3.0-1b-a400m-instruct
802
+ tokenizer_spec:
803
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
804
+ args:
805
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-instruct
806
+ end_of_text_token: ""
807
+ prefix_token: ""
808
+
809
+ - name: ibm-granite/granite-3.0-1b-a400m-base
810
+ tokenizer_spec:
811
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
812
+ args:
813
+ pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base
814
+ end_of_text_token: ""
815
+ prefix_token: ""
816
+
817
+ - name: maritaca-ai/sabia-7b
818
+ tokenizer_spec:
819
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
820
+ args:
821
+ pretrained_model_name_or_path: maritaca-ai/sabia-7b
822
+ end_of_text_token: "</s>"
823
+ prefix_token: "<s>"
824
+
825
+ # Granite-3.1-8b-base
826
+ - name: ibm-granite/granite-3.1-8b-base
827
+ tokenizer_spec:
828
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
829
+ args:
830
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-8b-base
831
+ prefix_token: ""
832
+ end_of_text_token: "<|endoftext|>"
833
+
834
+ # Granite-3.1-8b-instruct
835
+ - name: ibm-granite/granite-3.1-8b-instruct
836
+ tokenizer_spec:
837
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
838
+ args:
839
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-8b-instruct
840
+ prefix_token: ""
841
+ end_of_text_token: "<|endoftext|>"
842
+
843
+ # Granite-3.1-2b-instruct
844
+ - name: ibm-granite/granite-3.1-2b-instruct
845
+ tokenizer_spec:
846
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
847
+ args:
848
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-2b-instruct
849
+ prefix_token: ""
850
+ end_of_text_token: ""
851
+
852
+ # Granite-3.1-2b-base
853
+ - name: ibm-granite/granite-3.1-2b-base
854
+ tokenizer_spec:
855
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
856
+ args:
857
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-2b-base
858
+ prefix_token: ""
859
+ end_of_text_token: ""
860
+
861
+ # Granite-3.1-3b-a800m-instruct
862
+ - name: ibm-granite/granite-3.1-3b-a800m-instruct
863
+ tokenizer_spec:
864
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
865
+ args:
866
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-3b-a800m-instruct
867
+ prefix_token: ""
868
+ end_of_text_token: ""
869
+
870
+ # Granite-3.1-3b-a800m-base
871
+ - name: ibm-granite/granite-3.1-3b-a800m-base
872
+ tokenizer_spec:
873
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
874
+ args:
875
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-3b-a800m-base
876
+ prefix_token: ""
877
+ end_of_text_token: ""
878
+
879
+ # Granite-3.1-1b-a400m-instruct
880
+ - name: ibm-granite/granite-3.1-1b-a400m-instruct
881
+ tokenizer_spec:
882
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
883
+ args:
884
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-1b-a400m-instruct
885
+ prefix_token: ""
886
+ end_of_text_token: ""
887
+
888
+ # Granite-3.1-1b-a400m-base
889
+ - name: ibm-granite/granite-3.1-1b-a400m-base
890
+ tokenizer_spec:
891
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
892
+ args:
893
+ pretrained_model_name_or_path: ibm-granite/granite-3.1-1b-a400m-base
894
+ prefix_token: ""
895
+ end_of_text_token: ""
896
+
897
+ - name: ibm-granite/granite-20b-code-instruct-8k
898
+ tokenizer_spec:
899
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
900
+ args:
901
+ pretrained_model_name_or_path: ibm-granite/granite-20b-code-instruct-8k
902
+ prefix_token: ""
903
+ end_of_text_token: ""
904
+
905
+ - name: ibm-granite/granite-3b-code-instruct-128k
906
+ tokenizer_spec:
907
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
908
+ args:
909
+ pretrained_model_name_or_path: ibm-granite/granite-3b-code-instruct-128k
910
+ prefix_token: ""
911
+ end_of_text_token: ""
912
+
913
+
914
+
915
+ - name: ibm-granite/granite-34b-code-instruct-8k
916
+ tokenizer_spec:
917
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
918
+ args:
919
+ pretrained_model_name_or_path: ibm-granite/granite-34b-code-instruct-8k
920
+ prefix_token: ""
921
+ end_of_text_token: ""
922
+
923
+ - name: ibm-granite/granite-8b-code-instruct-128k
924
+ tokenizer_spec:
925
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
926
+ args:
927
+ pretrained_model_name_or_path: ibm-granite/granite-8b-code-instruct-128k
928
+ prefix_token: ""
929
+ end_of_text_token: ""
930
+
931
+
932
+ - name: ibm-granite/granite-guardian-3.1-2b
933
+ tokenizer_spec:
934
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
935
+ args:
936
+ pretrained_model_name_or_path: ibm-granite/granite-guardian-3.1-2b
937
+ prefix_token: ""
938
+ end_of_text_token: ""
939
+
940
+ - name: ibm-granite/granite-guardian-3.1-8b
941
+ tokenizer_spec:
942
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
943
+ args:
944
+ pretrained_model_name_or_path: ibm-granite/granite-guardian-3.1-8b
945
+ prefix_token: ""
946
+ end_of_text_token: ""
947
+
948
+
949
+
950
+ # DeepSeek-R1-Distill-Llama-3.1-8b
951
+ - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
952
+ tokenizer_spec:
953
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
954
+ args:
955
+ pretrained_model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
956
+ end_of_text_token: "<|end▁of▁sentence|>"
957
+ prefix_token: "<|begin▁of▁sentence|>"
958
+
959
+ # deepseek-ai/deepseek-coder-6.7b-instruct
960
+ - name: deepseek-ai/deepseek-coder-6.7b-instruct
961
+ tokenizer_spec:
962
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
963
+ args:
964
+ pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
965
+ end_of_text_token: "<|end▁of▁sentence|>"
966
+ prefix_token: "<|begin▁of▁sentence|>"
helm/proxy/cli.py CHANGED
@@ -23,8 +23,8 @@ import sys
23
23
 
24
24
  from helm.common.hierarchical_logger import hlog
25
25
  from helm.common.authentication import Authentication
26
- from .accounts import Usage, Account
27
- from .services.remote_service import RemoteService, add_service_args, create_authentication
26
+ from helm.proxy.accounts import Usage, Account
27
+ from helm.proxy.services.remote_service import RemoteService, add_service_args, create_authentication
28
28
 
29
29
  GRANULARITIES = ["daily", "monthly", "total"]
30
30
  UNLIMITED_QUOTA = "unlimited"
@@ -1,6 +1,6 @@
1
1
  import textwrap
2
2
 
3
- from .query import Query
3
+ from helm.proxy.query import Query
4
4
 
5
5
 
6
6
  def dedent(text: str) -> str:
helm/proxy/server.py CHANGED
@@ -6,6 +6,7 @@ Look at `index.js` to see how the functionality is invoked.
6
6
  from urllib.parse import unquote_plus
7
7
  import argparse
8
8
  import dataclasses
9
+ import importlib_resources as resources
9
10
  import json
10
11
  import os
11
12
  import sys
@@ -29,9 +30,9 @@ from helm.common.perspective_api_request import PerspectiveAPIRequest
29
30
  from helm.common.moderations_api_request import ModerationAPIRequest
30
31
  from helm.common.tokenization_request import TokenizationRequest, DecodeRequest
31
32
  from helm.proxy.services.service import CACHE_DIR
32
- from .accounts import Account
33
- from .services.server_service import ServerService
34
- from .query import Query
33
+ from helm.proxy.accounts import Account
34
+ from helm.proxy.services.server_service import ServerService
35
+ from helm.proxy.query import Query
35
36
 
36
37
  try:
37
38
  import gunicorn # noqa
@@ -86,7 +87,7 @@ def handle_root():
86
87
 
87
88
  @app.get("/static/<filename:path>")
88
89
  def handle_static_filename(filename):
89
- resp = bottle.static_file(filename, root=os.path.join(os.path.dirname(__file__), "static"))
90
+ resp = bottle.static_file(filename, root=app.config["helm.staticpath"])
90
91
  resp.add_header("Cache-Control", "no-store, must-revalidate ")
91
92
  return resp
92
93
 
@@ -106,15 +107,6 @@ def handle_get_general_info():
106
107
  return safe_call(perform)
107
108
 
108
109
 
109
- @app.get("/api/window_service_info")
110
- def handle_get_window_service_info():
111
- def perform(args):
112
- global service
113
- return dataclasses.asdict(service.get_window_service_info(args["model_name"]))
114
-
115
- return safe_call(perform)
116
-
117
-
118
110
  @app.post("/api/account")
119
111
  def handle_create_account():
120
112
  def perform(args):
@@ -293,6 +285,12 @@ def main():
293
285
  ensure_directory_exists(sqlite_cache_path)
294
286
  cache_backend_config = SqliteCacheBackendConfig(sqlite_cache_path)
295
287
 
288
+ static_package_name = "helm.proxy.static"
289
+ resource_path = resources.files(static_package_name).joinpath("index.html")
290
+ with resources.as_file(resource_path) as resource_filename:
291
+ static_path = str(resource_filename.parent)
292
+ app.config["helm.staticpath"] = static_path
293
+
296
294
  service = ServerService(base_path=args.base_path, cache_backend_config=cache_backend_config)
297
295
 
298
296
  gunicorn_args = {
@@ -15,7 +15,6 @@ from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
15
15
  from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
16
16
  from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
17
17
  from helm.common.tokenization_request import (
18
- WindowServiceInfo,
19
18
  TokenizationRequest,
20
19
  TokenizationRequestResult,
21
20
  DecodeRequestResult,
@@ -25,7 +24,7 @@ from helm.common.request import Request, RequestResult
25
24
  from dacite import from_dict
26
25
  from helm.proxy.accounts import Account
27
26
  from helm.proxy.query import Query, QueryResult
28
- from .service import Service, GeneralInfo
27
+ from helm.proxy.services.service import Service, GeneralInfo
29
28
 
30
29
 
31
30
  class RemoteServiceError(Exception):
@@ -51,11 +50,6 @@ class RemoteService(Service):
51
50
  response = requests.get(f"{self.base_url}/api/general_info").json()
52
51
  return from_dict(GeneralInfo, response)
53
52
 
54
- def get_window_service_info(self, model_name) -> WindowServiceInfo:
55
- params = {"model_name": model_name}
56
- response = requests.get(f"{self.base_url}/api/window_service_info?{urllib.parse.urlencode(params)}").json()
57
- return from_dict(WindowServiceInfo, response)
58
-
59
53
  def expand_query(self, query: Query) -> QueryResult:
60
54
  params = asdict(query)
61
55
  response = requests.get(f"{self.base_url}/api/query?{urllib.parse.urlencode(params)}").json()
@@ -14,7 +14,6 @@ from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
14
14
  from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
15
15
  from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
16
16
  from helm.common.tokenization_request import (
17
- WindowServiceInfo,
18
17
  TokenizationRequest,
19
18
  TokenizationRequestResult,
20
19
  DecodeRequest,
@@ -36,7 +35,7 @@ from helm.proxy.query import Query, QueryResult
36
35
  from helm.proxy.retry import retry_request
37
36
  from helm.proxy.token_counters.auto_token_counter import AutoTokenCounter
38
37
  from helm.tokenizers.auto_tokenizer import AutoTokenizer
39
- from .service import (
38
+ from helm.proxy.services.service import (
40
39
  Service,
41
40
  CACHE_DIR,
42
41
  ACCOUNTS_FILE,
@@ -85,22 +84,6 @@ class ServerService(Service):
85
84
  all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
86
85
  return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
87
86
 
88
- def get_window_service_info(self, model_name) -> WindowServiceInfo:
89
- # The import statement is placed here to avoid two problems, please refer to the link for details
90
- # https://github.com/stanford-crfm/helm/pull/1430#discussion_r1156686624
91
- from helm.benchmark.window_services.tokenizer_service import TokenizerService
92
- from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
93
-
94
- token_service = TokenizerService(self, Authentication(""))
95
- window_service = WindowServiceFactory.get_window_service(model_name, token_service)
96
- return WindowServiceInfo(
97
- tokenizer_name=window_service.tokenizer_name,
98
- max_sequence_length=window_service.max_sequence_length,
99
- max_request_length=window_service.max_request_length,
100
- end_of_text_token=window_service.end_of_text_token,
101
- prefix_token=window_service.prefix_token,
102
- )
103
-
104
87
  def expand_query(self, query: Query) -> QueryResult:
105
88
  """Turn the `query` into requests."""
106
89
  prompt = query.prompt
@@ -120,8 +103,12 @@ class ServerService(Service):
120
103
  return "dall_e"
121
104
  elif model_deployment.startswith("openai/gpt-4"):
122
105
  return "gpt4"
123
- else:
106
+ elif model_deployment.startswith("openai/gpt-3"):
124
107
  return "gpt3"
108
+ elif model_deployment.startswith("openai/o1"):
109
+ return "o1"
110
+ else:
111
+ return "openai"
125
112
  elif model_deployment.startswith("ai21/"):
126
113
  return "jurassic"
127
114
  else:
@@ -11,7 +11,6 @@ from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResu
11
11
  from helm.common.perspective_api_request import PerspectiveAPIRequestResult, PerspectiveAPIRequest
12
12
  from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
13
13
  from helm.common.tokenization_request import (
14
- WindowServiceInfo,
15
14
  TokenizationRequest,
16
15
  TokenizationRequestResult,
17
16
  DecodeRequest,
@@ -85,11 +84,6 @@ class Service(ABC):
85
84
  """Get general info."""
86
85
  pass
87
86
 
88
- @abstractmethod
89
- def get_window_service_info(self, model_name: str) -> WindowServiceInfo:
90
- """Get window service info."""
91
- pass
92
-
93
87
  @abstractmethod
94
88
  def expand_query(self, query: Query) -> QueryResult:
95
89
  """Turn the `query` into requests."""
@@ -18,8 +18,8 @@ from helm.common.authentication import Authentication
18
18
  from helm.common.request import Request, RequestResult
19
19
  from helm.common.tokenization_request import TokenizationRequest, TokenizationRequestResult
20
20
  from helm.proxy.accounts import Account, set_default_quotas
21
- from .remote_service import RemoteService
22
- from .service import ACCOUNTS_FILE
21
+ from helm.proxy.services.remote_service import RemoteService
22
+ from helm.proxy.services.service import ACCOUNTS_FILE
23
23
 
24
24
 
25
25
  @dataclass(frozen=True)
@@ -8,7 +8,7 @@ from helm.common.authentication import Authentication
8
8
  from helm.common.request import Request
9
9
  from helm.proxy.accounts import AuthenticationError, Accounts
10
10
  from helm.proxy.query import Query
11
- from .server_service import ServerService
11
+ from helm.proxy.services.server_service import ServerService
12
12
 
13
13
 
14
14
  def get_authentication():