crfm-helm: crfm_helm-0.5.4-py3-none-any.whl → crfm_helm-0.5.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic (further details were linked from the original page).

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,456 @@
1
+ /**
2
+ * This is a very quick and dirty frontend for just interacting with the models.
3
+ * Please refrain from adding additional functionality to this.
4
+ * TODO: Write this in React.
5
+ */
6
+ $(function () {
7
+ const urlParams = decodeUrlParams(window.location.search);
8
+ let auth = null;
9
+
10
+ function censor(api_key) {
11
+ // Show only the first k letters
12
+ const k = 2;
13
+ if (api_key.length <= k) {
14
+ return api_key;
15
+ }
16
+ return api_key.substring(0, k) + "*".repeat(api_key.length - k);
17
+ }
18
+
19
+ // Logging in and out
20
+ function updateLogin() {
21
+ const $loginInfo = $("#loginInfo");
22
+ $loginInfo.empty();
23
+ let api_key = readCookie("api_key");
24
+ if (api_key) {
25
+ auth = { api_key };
26
+ $loginInfo.append(
27
+ $("<a>", { class: "nav-link", href: "#" })
28
+ .append("Logout of API key " + censor(api_key))
29
+ .click(() => {
30
+ eraseCookie("api_key");
31
+ updateLogin();
32
+ }),
33
+ );
34
+ } else {
35
+ auth = null;
36
+ $loginInfo.append(
37
+ $("<a>", { class: "nav-link", href: "#" })
38
+ .append("Login")
39
+ .click(() => {
40
+ api_key = prompt("Enter your API key:");
41
+ if (!api_key) {
42
+ return;
43
+ }
44
+
45
+ // Check the API key the user entered using endpoint /api/account
46
+ const args = { auth: JSON.stringify({ api_key }) };
47
+ $.getJSON("/api/account", args, (response) => {
48
+ console.log("/api/account", response);
49
+ if ("error" in response) {
50
+ alert("The API key you've entered is invalid. Try again.");
51
+ } else {
52
+ createCookie("api_key", api_key);
53
+ updateLogin();
54
+ }
55
+ });
56
+ }),
57
+ );
58
+ }
59
+ }
60
+
61
+ updateLogin();
62
+
63
+ ////////////////////////////////////////////////////////////
64
+ // Rendering functions
65
+
66
+ function renderExampleQueries(updateQuery) {
67
+ // Show links for each example query, so when you click on them, they populate the textboxes.
68
+ const $examplesBlock = $("<div>", { class: "examples-block" });
69
+ $examplesBlock.append($("<span>").append("Examples:"));
70
+ generalInfo.example_queries.forEach((query, i) => {
71
+ const href = "#";
72
+ const title =
73
+ "[Prompt]\n" +
74
+ query.prompt +
75
+ "\n[Settings]\n" +
76
+ query.settings +
77
+ "\n[Environments]\n" +
78
+ query.environments;
79
+ const $link = $("<a>", { href, title }).append(`[${i}]`);
80
+ $link.click(() => {
81
+ // Populate the query with the example
82
+ updateQuery(query);
83
+ urlParams.prompt = query.prompt;
84
+ urlParams.settings = query.settings;
85
+ urlParams.environments = query.environments;
86
+ updateBrowserLocation(urlParams);
87
+ });
88
+ $examplesBlock.append("&nbsp;");
89
+ $examplesBlock.append($link);
90
+ });
91
+ return $examplesBlock;
92
+ }
93
+
94
+ function renderQuery(handleQueryResult) {
95
+ // Render the textboxes for entering the query (which includes the prompt, settings, and environment)
96
+ const $queryBlock = $("<div>", { class: "block" });
97
+ const $prompt = $("<textarea>", {
98
+ cols: 90,
99
+ rows: 7,
100
+ placeholder: "Enter prompt",
101
+ }).val(urlParams.prompt);
102
+ const $settings = $("<textarea>", {
103
+ cols: 90,
104
+ rows: 5,
105
+ placeholder:
106
+ "Enter settings (e.g., model: openai/text-davinci-002 for Instruct GPT-3); click Help at the top to learn more",
107
+ }).val(urlParams.settings);
108
+ const $environments = $("<textarea>", {
109
+ cols: 90,
110
+ rows: 3,
111
+ placeholder:
112
+ "Enter environment variables (e.g., city: [Boston, New York]); click Help at the top to learn more",
113
+ }).val(urlParams.environments);
114
+
115
+ $queryBlock.data("prompt", $prompt);
116
+ $queryBlock.data("settings", $settings);
117
+ $queryBlock.data("environments", $environments);
118
+
119
+ function bindSubmit($text) {
120
+ $text.keypress((e) => {
121
+ if ((event.keyCode === 10 || event.keyCode === 13) && event.ctrlKey) {
122
+ submit();
123
+ }
124
+ });
125
+ }
126
+
127
+ bindSubmit($prompt);
128
+ bindSubmit($settings);
129
+ bindSubmit($environments);
130
+
131
+ function submit() {
132
+ if (!auth) {
133
+ alert("You must log in first.");
134
+ return;
135
+ }
136
+
137
+ const query = {
138
+ prompt: $prompt.val(),
139
+ settings: $settings.val(),
140
+ environments: $environments.val(),
141
+ };
142
+
143
+ urlParams.prompt = query.prompt;
144
+ urlParams.settings = query.settings;
145
+ urlParams.environments = query.environments;
146
+ updateBrowserLocation(urlParams);
147
+
148
+ $.getJSON("/api/query", query, handleQueryResult);
149
+ }
150
+
151
+ // Show examples of queries
152
+ const $exampleQueries = renderExampleQueries((query) => {
153
+ $queryBlock.data("prompt").val(query.prompt);
154
+ $queryBlock.data("settings").val(query.settings);
155
+ $queryBlock.data("environments").val(query.environments);
156
+ urlParams.prompt = query.prompt;
157
+ urlParams.settings = query.settings;
158
+ urlParams.environments = query.environments;
159
+ updateBrowserLocation();
160
+ });
161
+
162
+ const promptHelp =
163
+ "This is the text you feed into the language model to complete.\nExample:\n Life is like";
164
+ const settingsHelp =
165
+ "Specifies what information we want from the language model (see [Help] for more details).\nExample:\n temperature: ${temperature}\n model: openai/davinci\n max_tokens: 10\n num_completions: 5";
166
+ const environmentsHelp =
167
+ "Specifies a list of values to try for each variable that appears in the prompt or settings.\nExample:\n temperature: [0, 0.5, 1]";
168
+
169
+ const $promptLabel = $("<span>")
170
+ .append(helpIcon(promptHelp, "help.html#query"))
171
+ .append("Prompt");
172
+ const $settingsLabel = $("<span>")
173
+ .append(helpIcon(settingsHelp, "help.html#query"))
174
+ .append("Settings");
175
+ const $environmentsLabel = $("<span>")
176
+ .append(helpIcon(environmentsHelp, "help.html#query"))
177
+ .append("Environments");
178
+
179
+ $queryBlock.append($("<h4>").append("Query"));
180
+ $queryBlock.append($exampleQueries);
181
+ const $table = $("<table>", { class: "query-table" });
182
+ $table.append(
183
+ $("<tr>")
184
+ .append($("<td>").append($promptLabel))
185
+ .append($("<td>").append($prompt)),
186
+ );
187
+ $table.append(
188
+ $("<tr>")
189
+ .append($("<td>").append($settingsLabel))
190
+ .append($("<td>").append($settings)),
191
+ );
192
+ $table.append(
193
+ $("<tr>")
194
+ .append($("<td>").append($environmentsLabel))
195
+ .append($("<td>").append($environments)),
196
+ );
197
+ $queryBlock.append($table);
198
+ $queryBlock.append($("<button>").append("Submit").click(submit));
199
+
200
+ return $queryBlock;
201
+ }
202
+
203
+ function getChangingKeys(items) {
204
+ // Return the list of keys whose values vary across `items`.
205
+ if (items.length === 0) {
206
+ return [];
207
+ }
208
+ return Object.keys(items[0]).filter((key) => {
209
+ return !items.every(
210
+ (item) => JSON.stringify(item[key]) === JSON.stringify(items[0][key]),
211
+ );
212
+ });
213
+ }
214
+
215
+ function renderRequest(changingKeys, request) {
216
+ // Render the request metadata (e.g., temperature if it is changing)
217
+ const title = JSON.stringify(request);
218
+ // Always include model, never prompt (since that's shown right after).
219
+ const showKeys = ["model"].concat(
220
+ changingKeys.filter((key) => key !== "prompt" && key !== "model"),
221
+ );
222
+ const summary =
223
+ "[" + showKeys.map((key) => key + ":" + request[key]).join(", ") + "]";
224
+ return $("<div>", { title }).append(
225
+ summary + " " + multilineHtml(request.prompt),
226
+ );
227
+ }
228
+
229
+ function renderTime(time) {
230
+ return Math.round(time * 10) / 10 + "s";
231
+ }
232
+
233
+ function constructTokenGroups(tokens) {
234
+ // Note: sometimes multiple tokens correspond to one character, for example:
235
+ // ["bytes:\xe2\x80", "bytes:\x99"] => ’
236
+ // For these, we collect the consecutive "bytes:" tokens into one group and concatenate their byte strings before decoding.
237
+ //
238
+ // get_num_bytes() and convert_tokens_to_text() in src/helm/benchmark/basic_metrics.py are adapted from this function.
239
+ const groups = [];
240
+ for (let i = 0; i < tokens.length;) {
241
+ // Aggregate consecutive tokens while they're "bytes:..."
242
+ const group = { tokens: [] };
243
+ if (tokens[i].text.startsWith("bytes:")) {
244
+ let bytestring = "";
245
+ while (i < tokens.length && tokens[i].text.startsWith("bytes:")) {
246
+ group.tokens.push(tokens[i]);
247
+ // Extract part after : (e.g., \xe2\x80)
248
+ bytestring += tokens[i].text.split(":")[1];
249
+ i++;
250
+ }
251
+ // Convert to encoded URI (e.g., %e2%80%99) and decode
252
+ group.text = decodeURIComponent(bytestring.replaceAll("\\x", "%"));
253
+ } else {
254
+ group.tokens.push(tokens[i]);
255
+ group.text = tokens[i].text;
256
+ i++;
257
+ }
258
+ groups.push(group);
259
+ }
260
+ return groups;
261
+ }
262
+
263
+ function renderTokens(tokens) {
264
+ // Render text as a sequence of tokens that you can interact with to see more information (e.g., logprobs)
265
+ const $result = $("<div>");
266
+ const groups = constructTokenGroups(tokens);
267
+ for (const group of groups) {
268
+ const $group = $("<span>", { class: "token" }).append(
269
+ multilineHtml(group.text),
270
+ );
271
+ $result.append($group);
272
+ }
273
+ return $result;
274
+ }
275
+
276
+ function renderRequestResult(requestResult) {
277
+ // Render the list of completions.
278
+ if (requestResult.error) {
279
+ return renderError(requestResult.error);
280
+ }
281
+ const $result = $("<div>");
282
+ requestResult.completions.forEach((completion) => {
283
+ const $contents = $("<span>", {
284
+ title: `logprob: ${completion.logprob}`,
285
+ }).append(renderTokens(completion.tokens));
286
+ const $metadata = $("<span>", { class: "metadata" });
287
+ $metadata.append(
288
+ $("<span>", { title: "Log probability" }).append(
289
+ round(completion.logprob, 2),
290
+ ),
291
+ );
292
+ if (completion.finish_reason) {
293
+ const title =
294
+ "Generation finished because of this reason: " +
295
+ JSON.stringify(completion.finish_reason);
296
+ $metadata
297
+ .append(" ")
298
+ .append(
299
+ $("<span>", { title }).append(completion.finish_reason.reason),
300
+ );
301
+ }
302
+ $result.append(
303
+ $("<div>", { class: "completion" }).append($metadata).append($contents),
304
+ );
305
+ });
306
+ $result.append($("<i>").append(renderTime(requestResult.request_time)));
307
+ return $result;
308
+ }
309
+
310
+ function renderAccount() {
311
+ // Render the account information (usage, quotas).
312
+ if (!auth) {
313
+ return null;
314
+ }
315
+
316
+ const $accountBlock = $("<div>", { class: "block" });
317
+ const args = { auth: JSON.stringify(auth) };
318
+ $.getJSON("/api/account", args, ([account]) => {
319
+ console.log("/api/account", account);
320
+ const items = [];
321
+ for (modelGroup in account.usages) {
322
+ for (granularity in account.usages[modelGroup]) {
323
+ const usage = account.usages[modelGroup][granularity];
324
+ // Only print out usage for model groups and granularities where there is a quota
325
+ if (usage.quota) {
326
+ const percent = Math.round((usage.used / usage.quota) * 100);
327
+ items.push(
328
+ `<b>${modelGroup}</b>: ${usage.period} (${usage.used} / ${usage.quota} = ${percent}%)`,
329
+ );
330
+ }
331
+ }
332
+ }
333
+ if (items.length === 0) {
334
+ items.push("no restrictions");
335
+ }
336
+ $accountBlock
337
+ .empty()
338
+ .append(
339
+ helpIcon(
340
+ "Specifies your usage/quota (321/10000) for each model group (e.g., gpt3) for the current period (e.g., 2022-1-2).",
341
+ "help.html#quotas",
342
+ ),
343
+ )
344
+ .append("Usage")
345
+ .append(": ")
346
+ .append(items.join(" | "));
347
+ });
348
+ return $accountBlock;
349
+ }
350
+
351
+ ////////////////////////////////////////////////////////////
352
+ // For index.html
353
+
354
+ function renderQueryInterface() {
355
+ // For index.html
356
+ const $accountBlock = $("<div>").append(renderAccount());
357
+
358
+ // Allow editing the query
359
+ const $queryBlock = renderQuery((queryResult) => {
360
+ // Create requests
361
+ console.log("/api/query", queryResult);
362
+ $requestsBlock.empty();
363
+
364
+ if (queryResult.error) {
365
+ $requestsBlock.append(renderError(queryResult.error));
366
+ return;
367
+ }
368
+
369
+ $requestsBlock.append(
370
+ $("<h4>").append(`Requests (${queryResult.requests.length})`),
371
+ );
372
+ if (queryResult.error) {
373
+ $requestsBlock.append(renderError(queryResult.error));
374
+ return;
375
+ }
376
+ const changingKeys = getChangingKeys(queryResult.requests);
377
+ queryResult.requests.forEach((request) => {
378
+ const $request = $("<div>", { class: "request" }).append(
379
+ renderRequest(changingKeys, request),
380
+ );
381
+ const $requestResult = $("<div>").append($("<i>").append("(waiting)"));
382
+ const args = {
383
+ auth: JSON.stringify(auth),
384
+ request: JSON.stringify(request),
385
+ };
386
+ $.getJSON("/api/request", args, (requestResult) => {
387
+ console.log("/api/request", request, requestResult);
388
+ $requestResult.empty().append(renderRequestResult(requestResult));
389
+ if (!requestResult.cached) {
390
+ $accountBlock.empty().append(renderAccount());
391
+ }
392
+ });
393
+ $request.append($requestResult);
394
+ $requestsBlock.append($request);
395
+ });
396
+ });
397
+
398
+ // Where the requests and responses come in
399
+ const $requestsBlock = $("<div>", { class: "block" });
400
+
401
+ const $group = $("<div>");
402
+ $group.append($accountBlock);
403
+ $group.append($queryBlock);
404
+ $group.append($requestsBlock);
405
+ return $group;
406
+ }
407
+
408
+ ////////////////////////////////////////////////////////////
409
+ // For help.html
410
+
411
+ function renderModelsTable() {
412
+ // Render the list of models
413
+ const $table = $("<table>", { class: "table" });
414
+ const $header = $("<tr>")
415
+ .append($("<td>").append("group"))
416
+ .append($("<td>").append("name"))
417
+ .append($("<td>").append("description"))
418
+ .append($("<td>").append("tags"));
419
+ $table.append($header);
420
+ generalInfo.all_models.forEach((model) => {
421
+ const $row = $("<tr>")
422
+ .append($("<td>").append($("<tt>").append(model.group)))
423
+ .append($("<td>").append($("<tt>").append(model.name)))
424
+ .append($("<td>").append(model.description))
425
+ .append($("<td>").append(model.tags.join(" ")));
426
+ $table.append($row);
427
+ });
428
+ return $table;
429
+ }
430
+
431
+ ////////////////////////////////////////////////////////////
432
+ // Main
433
+
434
+ let generalInfo;
435
+
436
+ $.getJSON("/api/general_info", (response) => {
437
+ generalInfo = response;
438
+ console.log("/api/general_info", generalInfo);
439
+ if (generalInfo.error) {
440
+ alert(generalInfo.error);
441
+ return;
442
+ }
443
+
444
+ // For index.html
445
+ const $main = $("#main");
446
+ if ($main.length > 0) {
447
+ $main.empty().append(renderQueryInterface());
448
+ }
449
+
450
+ // For help.html
451
+ const $helpModels = $("#help-models");
452
+ if ($helpModels.length > 0) {
453
+ $helpModels.empty().append(renderModelsTable());
454
+ }
455
+ });
456
+ });
Binary file
helm/proxy/test_retry.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from retrying import RetryError
2
2
 
3
3
  from helm.common.request import RequestResult
4
- from .retry import retry_request, get_retry_decorator, retry_if_request_failed
4
+ from helm.proxy.retry import retry_request, get_retry_decorator, retry_if_request_failed
5
5
 
6
6
 
7
7
  def test_retry_for_successful_request():
@@ -4,7 +4,7 @@ from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_
4
4
  from helm.common.request import Request, GeneratedOutput
5
5
  from helm.tokenizers.auto_tokenizer import AutoTokenizer
6
6
  from helm.common.tokenization_request import TokenizationRequest, TokenizationRequestResult
7
- from .token_counter import TokenCounter
7
+ from helm.proxy.token_counters.token_counter import TokenCounter
8
8
 
9
9
 
10
10
  class AutoTokenCounter(TokenCounter):
@@ -10,7 +10,7 @@ from helm.common.tokenization_request import (
10
10
  DecodeRequest,
11
11
  TokenizationToken,
12
12
  )
13
- from .caching_tokenizer import CachingTokenizer
13
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
14
14
 
15
15
  try:
16
16
  from aleph_alpha_client import Client as AlephAlphaPythonClient
@@ -1,6 +1,6 @@
1
1
  from abc import abstractmethod
2
2
  from dataclasses import asdict
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List
4
4
 
5
5
  from helm.common.cache import Cache, CacheConfig
6
6
  from helm.common.request import wrap_request_time
@@ -11,7 +11,7 @@ from helm.common.tokenization_request import (
11
11
  DecodeRequestResult,
12
12
  TokenizationToken,
13
13
  )
14
- from .tokenizer import Tokenizer
14
+ from helm.tokenizers.tokenizer import Tokenizer
15
15
 
16
16
 
17
17
  class CachingTokenizer(Tokenizer):
@@ -153,31 +153,3 @@ class CachingTokenizer(Tokenizer):
153
153
  )
154
154
  except Exception as error:
155
155
  raise ValueError(f"Failed to decode tokens with {self.__class__.__name__} tokenizer: {error}") from error
156
-
157
-
158
- def cleanup_str(token: str, tokenizer_name: Optional[str] = None) -> str:
159
- """
160
- Certain tokenizers introduce special characters to represent spaces, such as
161
- "Ġ" or "▁". This function removes those characters.
162
- """
163
- if tokenizer_name in [
164
- "TsinghuaKEG/ice",
165
- "bigscience/T0pp",
166
- "google/t5-11b",
167
- "google/flan-t5-xxl",
168
- "google/ul2",
169
- "Yandex/yalm",
170
- "ai21/j1",
171
- "together",
172
- ]:
173
- return token.replace("▁", " ")
174
- elif tokenizer_name is not None and tokenizer_name.startswith("huggingface"):
175
- return token.replace("Ġ", " ")
176
- return token
177
-
178
-
179
- def cleanup_tokens(tokens: List[str], tokenizer_name: Optional[str] = None) -> List[str]:
180
- """
181
- Applies `cleanup_str` to each token in `tokens`.
182
- """
183
- return [cleanup_str(token, tokenizer_name) for token in tokens]
@@ -0,0 +1,53 @@
1
+ import dataclasses
2
+ import os
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ import requests
6
+
7
+ from helm.common.cache import CacheConfig
8
+ from helm.common.tokenization_request import (
9
+ TokenizationRequest,
10
+ TokenizationToken,
11
+ )
12
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
13
+
14
+
15
+ class GrokAPITokenizer(CachingTokenizer):
16
+ """Tokenizer that uses the xAI Grok Tokenize Text API
17
+
18
+ Doc: https://docs.x.ai/docs/api-reference#tokenize-text"""
19
+
20
+ def __init__(self, cache_config: CacheConfig, api_key: Optional[str] = None) -> None:
21
+ super().__init__(cache_config)
22
+ self.api_key = api_key or os.environ.get("XAI_API_KEY")
23
+
24
+ def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
25
+ cache_key = dataclasses.asdict(request)
26
+ # Delete encode because the Grok API simultaneously gives string and integer tokens.
27
+ del cache_key["encode"]
28
+ return cache_key
29
+
30
+ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
31
+ if not self.api_key:
32
+ raise Exception(
33
+ "No Grok API key found. "
34
+ "Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
35
+ )
36
+ text = request["text"]
37
+ model = request["tokenizer"].split("/")[-1]
38
+ response = requests.post(
39
+ url="https://api.x.ai/v1/tokenize-text",
40
+ headers={"Authorization": f"Bearer {self.api_key}"},
41
+ json={"text": text, "model": model},
42
+ )
43
+ response.raise_for_status()
44
+ return response.json()
45
+
46
+ def _tokenization_raw_response_to_tokens(
47
+ self, response: Dict[str, Any], request: TokenizationRequest
48
+ ) -> List[TokenizationToken]:
49
+ raw_token_field_name = "token_id" if request.encode else "string_token"
50
+ return [TokenizationToken(raw_token[raw_token_field_name]) for raw_token in response["token_ids"]]
51
+
52
+ def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
53
+ raise NotImplementedError("The xAI API does not support decoding.")
@@ -11,7 +11,7 @@ from helm.common.tokenization_request import (
11
11
  TokenizationRequestResult,
12
12
  TokenizationToken,
13
13
  )
14
- from .tokenizer import Tokenizer
14
+ from helm.tokenizers.tokenizer import Tokenizer
15
15
 
16
16
  import requests
17
17
 
@@ -7,8 +7,8 @@ from helm.common.concurrency import ThreadSafeWrapper
7
7
  from transformers import AutoTokenizer, PreTrainedTokenizerBase
8
8
 
9
9
  from helm.common.hierarchical_logger import htrack_block, hlog
10
- from .caching_tokenizer import CachingTokenizer
11
- from .tokenizer import cleanup_tokens
10
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
11
+ from helm.tokenizers.tokenizer import cleanup_tokens
12
12
 
13
13
 
14
14
  WrappedPreTrainedTokenizer = ThreadSafeWrapper[PreTrainedTokenizerBase]
@@ -119,7 +119,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
119
119
  tokens = tokenizer.encode(
120
120
  request["text"],
121
121
  truncation=request["truncation"],
122
- max_length=request["max_length"],
122
+ max_length=max(request["max_length"], 0),
123
123
  add_special_tokens=False,
124
124
  )
125
125
  else:
@@ -5,7 +5,7 @@ import torch
5
5
 
6
6
  from helm.common.cache import CacheConfig
7
7
  from helm.common.optional_dependencies import OptionalDependencyNotInstalled
8
- from .caching_tokenizer import CachingTokenizer
8
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
9
9
 
10
10
  try:
11
11
  from lit_gpt import Tokenizer as InternalTokenizer
@@ -10,7 +10,7 @@ from helm.common.tokenization_request import (
10
10
  TokenizationRequest,
11
11
  TokenizationRequestResult,
12
12
  )
13
- from helm.tokenizers.anthropic_tokenizer import AnthropicTokenizer
13
+ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
14
14
 
15
15
 
16
16
  class TestAnthropicTokenizer:
@@ -21,7 +21,11 @@ class TestAnthropicTokenizer:
21
21
  def setup_method(self, method):
22
22
  cache_file = tempfile.NamedTemporaryFile(delete=False)
23
23
  self.cache_path: str = cache_file.name
24
- self.tokenizer = AnthropicTokenizer(SqliteCacheConfig(self.cache_path))
24
+ self.tokenizer = HuggingFaceTokenizer(
25
+ SqliteCacheConfig(self.cache_path),
26
+ tokenizer_name="anthropic/claude",
27
+ pretrained_model_name_or_path="Xenova/claude-tokenizer",
28
+ )
25
29
 
26
30
  def teardown_method(self, method):
27
31
  os.remove(self.cache_path)
@@ -0,0 +1,33 @@
1
+ import os
2
+ import pytest
3
+
4
+ from helm.common.cache import BlackHoleCacheConfig
5
+ from helm.common.tokenization_request import (
6
+ TokenizationRequest,
7
+ TokenizationToken,
8
+ )
9
+ from helm.tokenizers.grok_tokenizer import GrokAPITokenizer
10
+
11
+
12
+ @pytest.mark.models
13
+ def test_tokenize():
14
+ if not os.environ.get("XAI_API_KEY"):
15
+ pytest.skip("No xAI API key found; skipping test")
16
+ tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
17
+ request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦")
18
+ result = tokenizer.tokenize(request)
19
+ assert result.success
20
+ assert not result.cached
21
+ assert result.tokens == [TokenizationToken(token) for token in ["otter", "", "", ""]]
22
+
23
+
24
+ @pytest.mark.models
25
+ def test_encode():
26
+ if not os.environ.get("XAI_API_KEY"):
27
+ pytest.skip("No xAI API key found; skipping test")
28
+ tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
29
+ request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦", encode=True)
30
+ result = tokenizer.tokenize(request)
31
+ assert result.success
32
+ assert not result.cached
33
+ assert result.tokens == [TokenizationToken(token) for token in [142507, 11637, 294, 294]]