crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (206) hide show
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -252,6 +252,19 @@ tokenizer_configs:
252
252
  end_of_text_token: "<eos>"
253
253
  prefix_token: "<bos>"
254
254
 
255
+ # Grok
256
+ - name: xai/grok-3-beta
257
+ tokenizer_spec:
258
+ class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
259
+ end_of_text_token: ""
260
+ prefix_token: ""
261
+
262
+ - name: xai/grok-3-mini-beta
263
+ tokenizer_spec:
264
+ class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
265
+ end_of_text_token: ""
266
+ prefix_token: ""
267
+
255
268
  # Hf-internal-testing
256
269
 
257
270
  # Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
@@ -299,6 +312,22 @@ tokenizer_configs:
299
312
  end_of_text_token: "<|endoftext|>"
300
313
  prefix_token: "<|endoftext|>"
301
314
 
315
+ - name: huggingface/smollm2-135m
316
+ tokenizer_spec:
317
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
318
+ args:
319
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M
320
+ end_of_text_token: "<|endoftext|>"
321
+ prefix_token: "<|endoftext|>"
322
+
323
+ - name: huggingface/smollm2-135m-instruct
324
+ tokenizer_spec:
325
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
326
+ args:
327
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
328
+ end_of_text_token: "<|endoftext|>"
329
+ prefix_token: "<|im_end|>"
330
+
302
331
  # Lightning AI
303
332
  - name: lightningai/lit-gpt
304
333
  tokenizer_spec:
@@ -396,6 +425,14 @@ tokenizer_configs:
396
425
  prefix_token: "<|begin_of_text|>"
397
426
  end_of_text_token: "<|eot_id|>"
398
427
 
428
+ - name: meta/llama-4-scout-17b-16e-instruct
429
+ tokenizer_spec:
430
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
431
+ args:
432
+ pretrained_model_name_or_path: meta-llama/Llama-4-Scout-17B-16E-Instruct
433
+ prefix_token: "<|begin_of_text|>"
434
+ end_of_text_token: "<|end_of_text|>"
435
+
399
436
  # 01-ai
400
437
  - name: 01-ai/Yi-6B
401
438
  tokenizer_spec:
@@ -432,9 +469,33 @@ tokenizer_configs:
432
469
  end_of_text_token: "<|endoftext|>"
433
470
  prefix_token: ""
434
471
 
472
+ - name: allenai/olmo-2-1124-7b-instruct
473
+ tokenizer_spec:
474
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
475
+ end_of_text_token: "<|endoftext|>"
476
+ prefix_token: "<|endoftext|>"
477
+
478
+ - name: allenai/olmo-2-0325-32b-instruct
479
+ tokenizer_spec:
480
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
481
+ end_of_text_token: "<|endoftext|>"
482
+ prefix_token: "<|endoftext|>"
483
+
484
+ - name: allenai/olmoe-1b-7b-0125-instruct
485
+ tokenizer_spec:
486
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
487
+ end_of_text_token: "|||IP_ADDRESS|||"
488
+ prefix_token: "|||IP_ADDRESS|||"
489
+
490
+ # Marin Community
491
+ - name: marin-community/marin-8b-instruct
492
+ tokenizer_spec:
493
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
494
+ end_of_text_token: "<|eot_id|>"
495
+ prefix_token: "<|begin_of_text|>"
435
496
 
436
497
  # Microsoft
437
- - name: microsoft/phi-2
498
+ - name: microsoft/phi-2
438
499
  tokenizer_spec:
439
500
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
440
501
  end_of_text_token: "<|endoftext|>"
@@ -619,6 +680,14 @@ tokenizer_configs:
619
680
  end_of_text_token: "<|im_end|>"
620
681
  prefix_token: "<|im_start|>"
621
682
 
683
+ - name: qwen/qwen3-235b-a22b
684
+ tokenizer_spec:
685
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
686
+ args:
687
+ pretrained_model_name_or_path: Qwen/Qwen3-235B-A22B
688
+ end_of_text_token: "<|im_end|>"
689
+ prefix_token: "<|im_start|>"
690
+
622
691
  - name: qwen/qwq-32b-preview
623
692
  tokenizer_spec:
624
693
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -664,6 +733,15 @@ tokenizer_configs:
664
733
  end_of_text_token: "<|endoftext|>"
665
734
  prefix_token: ""
666
735
 
736
+ - name: qwen/qwen2.5-omni-7b
737
+ tokenizer_spec:
738
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
739
+ args:
740
+ pretrained_model_name_or_path: Qwen/Qwen2.5-Omni-7B
741
+ trust_remote_code: false
742
+ end_of_text_token: "<|endoftext|>"
743
+ prefix_token: ""
744
+
667
745
  # SambaLingo
668
746
  - name: sambanova/sambalingo-thai-base
669
747
  tokenizer_spec:
@@ -910,8 +988,6 @@ tokenizer_configs:
910
988
  prefix_token: ""
911
989
  end_of_text_token: ""
912
990
 
913
-
914
-
915
991
  - name: ibm-granite/granite-34b-code-instruct-8k
916
992
  tokenizer_spec:
917
993
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -945,7 +1021,17 @@ tokenizer_configs:
945
1021
  prefix_token: ""
946
1022
  end_of_text_token: ""
947
1023
 
1024
+ # IBM Granite 3.3
1025
+
1026
+ - name: ibm/granite-3.3-8b-instruct
1027
+ tokenizer_spec:
1028
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1029
+ args:
1030
+ pretrained_model_name_or_path: ibm-granite/granite-3.3-8b-instruct
1031
+ end_of_text_token: "<|end_of_text|>"
1032
+ prefix_token: "<|end_of_text|>"
948
1033
 
1034
+
949
1035
 
950
1036
  # DeepSeek-R1-Distill-Llama-3.1-8b
951
1037
  - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
@@ -964,3 +1050,58 @@ tokenizer_configs:
964
1050
  pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
965
1051
  end_of_text_token: "<|end▁of▁sentence|>"
966
1052
  prefix_token: "<|begin▁of▁sentence|>"
1053
+
1054
+
1055
+ # vilm/vinallama-2.7b-chat
1056
+ - name: vilm/vinallama-2.7b-chat
1057
+ tokenizer_spec:
1058
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1059
+ args:
1060
+ pretrained_model_name_or_path: vilm/vinallama-2.7b-chat
1061
+ end_of_text_token: "<im_end>"
1062
+ prefix_token: "<im_start>"
1063
+
1064
+ # vilm/vinallama-7b-chat
1065
+ - name: vilm/vinallama-7b-chat
1066
+ tokenizer_spec:
1067
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1068
+ args:
1069
+ pretrained_model_name_or_path: vilm/vinallama-7b-chat
1070
+ end_of_text_token: "<im_end>"
1071
+ prefix_token: "<im_start>"
1072
+
1073
+ # vilm/vietcuna-7b-v3
1074
+ - name: vilm/vietcuna-7b-v3
1075
+ tokenizer_spec:
1076
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1077
+ args:
1078
+ pretrained_model_name_or_path: vilm/vietcuna-7b-v3
1079
+ end_of_text_token: "</s>"
1080
+ prefix_token: "<s>"
1081
+
1082
+ # Viet-Mistral/Vistral-7B-Chat
1083
+ - name: Viet-Mistral/Vistral-7B-Chat
1084
+ tokenizer_spec:
1085
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1086
+ args:
1087
+ pretrained_model_name_or_path: Viet-Mistral/Vistral-7B-Chat
1088
+ end_of_text_token: "</s>"
1089
+ prefix_token: "<s>"
1090
+
1091
+ # vinai/PhoGPT-7B5-Instruct
1092
+ - name: vinai/PhoGPT-7B5-Instruct
1093
+ tokenizer_spec:
1094
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1095
+ args:
1096
+ pretrained_model_name_or_path: vinai/PhoGPT-7B5-Instruct
1097
+ end_of_text_token: "</s>"
1098
+ prefix_token: "<s>"
1099
+
1100
+ # vinai/PhoGPT-4B-Chat
1101
+ - name: vinai/PhoGPT-4B-Chat
1102
+ tokenizer_spec:
1103
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1104
+ args:
1105
+ pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
1106
+ end_of_text_token: "</s>"
1107
+ prefix_token: "<s>"
helm/proxy/cli.py CHANGED
@@ -21,7 +21,7 @@ from typing import List, Dict
21
21
  import re
22
22
  import sys
23
23
 
24
- from helm.common.hierarchical_logger import hlog
24
+ from helm.common.hierarchical_logger import hlog, setup_default_logging
25
25
  from helm.common.authentication import Authentication
26
26
  from helm.proxy.accounts import Usage, Account
27
27
  from helm.proxy.services.remote_service import RemoteService, add_service_args, create_authentication
@@ -198,6 +198,8 @@ def main():
198
198
 
199
199
  args = parser.parse_args()
200
200
 
201
+ setup_default_logging()
202
+
201
203
  service = create_remote_service(args)
202
204
  auth = create_authentication(args)
203
205
 
@@ -38,7 +38,7 @@ def replace_emoji_characters(s: str) -> str:
38
38
  highpoints = re.compile("[\U00010000-\U0010ffff]")
39
39
  elif sys.maxunicode == 65535:
40
40
  # Python was built with '--enable-unicode=ucs2'
41
- highpoints = re.compile("[\uD800-\uDBFF][\uDC00-\uDFFF]")
41
+ highpoints = re.compile("[\ud800-\udbff][\udc00-\udfff]")
42
42
  else:
43
43
  raise UnicodeError("Unable to determine if Python was built using UCS-2 or UCS-4")
44
44
 
@@ -1,8 +1,8 @@
1
- import dataclasses
2
1
  import os
3
2
  import signal
4
- from typing import List, Optional
3
+ from typing import List
5
4
 
5
+ from helm.common.local_context import LocalContext
6
6
  from helm.common.cache import CacheConfig
7
7
  from helm.common.cache_backend_config import CacheBackendConfig, BlackHoleCacheBackendConfig
8
8
  from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
@@ -11,7 +11,6 @@ from helm.common.moderations_api_request import ModerationAPIRequest, Moderation
11
11
  from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
12
12
  from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
13
13
  from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
14
- from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
15
14
  from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
16
15
  from helm.common.tokenization_request import (
17
16
  TokenizationRequest,
@@ -22,27 +21,13 @@ from helm.common.tokenization_request import (
22
21
  from helm.common.request import Request, RequestResult
23
22
  from helm.common.hierarchical_logger import hlog
24
23
  from helm.proxy.accounts import Accounts, Account
25
- from helm.clients.auto_client import AutoClient
26
- from helm.clients.moderation_api_client import ModerationAPIClient
27
- from helm.clients.image_generation.nudity_check_client import NudityCheckClient
28
- from helm.clients.gcs_client import GCSClient
29
- from helm.clients.clip_score_client import CLIPScoreClient
30
- from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
31
- from helm.proxy.example_queries import example_queries
32
- from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
33
24
  from helm.benchmark.model_deployment_registry import get_model_deployment_host_organization
34
25
  from helm.proxy.query import Query, QueryResult
35
- from helm.proxy.retry import retry_request
36
26
  from helm.proxy.token_counters.auto_token_counter import AutoTokenCounter
37
- from helm.tokenizers.auto_tokenizer import AutoTokenizer
38
27
  from helm.proxy.services.service import (
39
28
  Service,
40
- CACHE_DIR,
41
29
  ACCOUNTS_FILE,
42
30
  GeneralInfo,
43
- VERSION,
44
- expand_environments,
45
- synthesize_request,
46
31
  )
47
32
 
48
33
 
@@ -57,43 +42,17 @@ class ServerService(Service):
57
42
  root_mode: bool = False,
58
43
  cache_backend_config: CacheBackendConfig = BlackHoleCacheBackendConfig(),
59
44
  ):
60
- ensure_directory_exists(base_path)
61
- client_file_storage_path = os.path.join(base_path, CACHE_DIR)
62
- ensure_directory_exists(client_file_storage_path)
63
-
64
- credentials = get_credentials(base_path)
65
45
  accounts_path = os.path.join(base_path, ACCOUNTS_FILE)
66
46
 
67
- self.cache_backend_config = cache_backend_config
68
- self.client = AutoClient(credentials, client_file_storage_path, cache_backend_config)
69
- self.tokenizer = AutoTokenizer(credentials, cache_backend_config)
70
- self.token_counter = AutoTokenCounter(self.tokenizer)
47
+ self.context = LocalContext(base_path, cache_backend_config)
48
+ self.token_counter = AutoTokenCounter(self.context.tokenizer)
71
49
  self.accounts = Accounts(accounts_path, root_mode=root_mode)
72
50
 
73
- # Lazily instantiate the following clients
74
- self.moderation_api_client: Optional[ModerationAPIClient] = None
75
- self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
76
- self.perspective_api_client: Optional[ToxicityClassifierClient] = None
77
- self.nudity_check_client: Optional[NudityCheckClient] = None
78
- self.clip_score_client: Optional[CLIPScoreClient] = None
79
- self.gcs_client: Optional[GCSClient] = None
80
-
81
51
  def get_general_info(self) -> GeneralInfo:
82
- # Can't send release_dates in ModelMetadata because dates cannot be round-tripped to and from JSON easily.
83
- # TODO(#2158): Either fix this or delete get_general_info.
84
- all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
85
- return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
52
+ return self.context.get_general_info()
86
53
 
87
54
  def expand_query(self, query: Query) -> QueryResult:
88
- """Turn the `query` into requests."""
89
- prompt = query.prompt
90
- settings = query.settings
91
- environments = parse_hocon(query.environments)
92
- requests = []
93
- for environment in expand_environments(environments):
94
- request = synthesize_request(prompt, settings, environment)
95
- requests.append(request)
96
- return QueryResult(requests=requests)
55
+ return self.context.expand_query(query)
97
56
 
98
57
  def _get_model_group_for_model_deployment(self, model_deployment: str) -> str:
99
58
  if model_deployment.startswith("openai/"):
@@ -105,7 +64,11 @@ class ServerService(Service):
105
64
  return "gpt4"
106
65
  elif model_deployment.startswith("openai/gpt-3"):
107
66
  return "gpt3"
108
- elif model_deployment.startswith("openai/o1"):
67
+ elif (
68
+ model_deployment.startswith("openai/o1")
69
+ or model_deployment.startswith("openai/o3")
70
+ or model_deployment.startswith("openai/o4")
71
+ ):
109
72
  return "o1"
110
73
  else:
111
74
  return "openai"
@@ -126,7 +89,7 @@ class ServerService(Service):
126
89
  self.accounts.check_can_use(auth.api_key, model_group)
127
90
 
128
91
  # Use!
129
- request_result: RequestResult = self.client.make_request(request)
92
+ request_result: RequestResult = self.context.make_request(request)
130
93
 
131
94
  # Only deduct if not cached
132
95
  if not request_result.cached:
@@ -139,66 +102,39 @@ class ServerService(Service):
139
102
  def tokenize(self, auth: Authentication, request: TokenizationRequest) -> TokenizationRequestResult:
140
103
  """Tokenize via an API."""
141
104
  self.accounts.authenticate(auth)
142
- return self.tokenizer.tokenize(request)
105
+ return self.context.tokenize(request)
143
106
 
144
107
  def decode(self, auth: Authentication, request: DecodeRequest) -> DecodeRequestResult:
145
108
  """Decodes to text."""
146
109
  self.accounts.authenticate(auth)
147
- return self.tokenizer.decode(request)
110
+ return self.context.decode(request)
148
111
 
149
112
  def upload(self, auth: Authentication, request: FileUploadRequest) -> FileUploadResult:
150
113
  """Uploads a file to external storage."""
151
114
  self.accounts.authenticate(auth)
152
-
153
- if not self.gcs_client:
154
- self.gcs_client = self.client.get_gcs_client()
155
-
156
- assert self.gcs_client
157
- return self.gcs_client.upload(request)
115
+ return self.context.upload(request)
158
116
 
159
117
  def check_nudity(self, auth: Authentication, request: NudityCheckRequest) -> NudityCheckResult:
160
118
  """Check for nudity."""
161
119
  self.accounts.authenticate(auth)
162
-
163
- if not self.nudity_check_client:
164
- self.nudity_check_client = self.client.get_nudity_check_client()
165
-
166
- assert self.nudity_check_client
167
- return self.nudity_check_client.check_nudity(request)
120
+ return self.context.check_nudity(request)
168
121
 
169
122
  def compute_clip_score(self, auth: Authentication, request: CLIPScoreRequest) -> CLIPScoreResult:
170
123
  """Computes CLIPScore for a given caption and image."""
171
124
  self.accounts.authenticate(auth)
172
-
173
- if not self.clip_score_client:
174
- self.clip_score_client = self.client.get_clip_score_client()
175
-
176
- assert self.clip_score_client
177
- return self.clip_score_client.compute_score(request)
125
+ return self.context.compute_clip_score(request)
178
126
 
179
127
  def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
180
- @retry_request
181
- def get_toxicity_scores_with_retry(request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
182
- if not self.toxicity_classifier_client:
183
- self.toxicity_classifier_client = self.client.get_toxicity_classifier_client()
184
- return self.toxicity_classifier_client.get_toxicity_scores(request)
185
-
186
128
  self.accounts.authenticate(auth)
187
- return get_toxicity_scores_with_retry(request)
129
+ return self.context.get_toxicity_scores(request)
188
130
 
189
131
  def get_moderation_results(self, auth: Authentication, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
190
- @retry_request
191
- def get_moderation_results_with_retry(request: ModerationAPIRequest) -> ModerationAPIRequestResult:
192
- if not self.moderation_api_client:
193
- self.moderation_api_client = self.client.get_moderation_api_client()
194
- return self.moderation_api_client.get_moderation_results(request)
195
-
196
132
  self.accounts.authenticate(auth)
197
- return get_moderation_results_with_retry(request)
133
+ return self.context.get_moderation_results(request)
198
134
 
199
135
  def make_critique_request(self, auth: Authentication, request: CritiqueRequest) -> CritiqueRequestResult:
200
136
  self.accounts.authenticate(auth)
201
- return self.client.get_critique_client().make_critique_request(request)
137
+ return self.context.make_critique_request(request)
202
138
 
203
139
  def create_account(self, auth: Authentication) -> Account:
204
140
  """Creates a new account."""
@@ -233,4 +169,4 @@ class ServerService(Service):
233
169
  hlog("Done.")
234
170
 
235
171
  def get_cache_config(self, shard_name: str) -> CacheConfig:
236
- return self.cache_backend_config.get_cache_config(shard_name)
172
+ return self.context.get_cache_config(shard_name)
@@ -0,0 +1,53 @@
1
+ import dataclasses
2
+ import os
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ import requests
6
+
7
+ from helm.common.cache import CacheConfig
8
+ from helm.common.tokenization_request import (
9
+ TokenizationRequest,
10
+ TokenizationToken,
11
+ )
12
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
13
+
14
+
15
+ class GrokAPITokenizer(CachingTokenizer):
16
+ """Tokenizer that uses the xAI Grok Tokenize Text API
17
+
18
+ Doc: https://docs.x.ai/docs/api-reference#tokenize-text"""
19
+
20
+ def __init__(self, cache_config: CacheConfig, api_key: Optional[str] = None) -> None:
21
+ super().__init__(cache_config)
22
+ self.api_key = api_key or os.environ.get("XAI_API_KEY")
23
+
24
+ def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
25
+ cache_key = dataclasses.asdict(request)
26
+ # Delete encode because the Grok API simultaneously gives string and integer tokens.
27
+ del cache_key["encode"]
28
+ return cache_key
29
+
30
+ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
31
+ if not self.api_key:
32
+ raise Exception(
33
+ "No Grok API key found. "
34
+ "Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
35
+ )
36
+ text = request["text"]
37
+ model = request["tokenizer"].split("/")[-1]
38
+ response = requests.post(
39
+ url="https://api.x.ai/v1/tokenize-text",
40
+ headers={"Authorization": f"Bearer {self.api_key}"},
41
+ json={"text": text, "model": model},
42
+ )
43
+ response.raise_for_status()
44
+ return response.json()
45
+
46
+ def _tokenization_raw_response_to_tokens(
47
+ self, response: Dict[str, Any], request: TokenizationRequest
48
+ ) -> List[TokenizationToken]:
49
+ raw_token_field_name = "token_id" if request.encode else "string_token"
50
+ return [TokenizationToken(raw_token[raw_token_field_name]) for raw_token in response["token_ids"]]
51
+
52
+ def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
53
+ raise NotImplementedError("The xAI API does not support decoding.")
@@ -119,7 +119,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
119
119
  tokens = tokenizer.encode(
120
120
  request["text"],
121
121
  truncation=request["truncation"],
122
- max_length=request["max_length"],
122
+ max_length=max(request["max_length"], 0),
123
123
  add_special_tokens=False,
124
124
  )
125
125
  else:
@@ -0,0 +1,33 @@
1
+ import os
2
+ import pytest
3
+
4
+ from helm.common.cache import BlackHoleCacheConfig
5
+ from helm.common.tokenization_request import (
6
+ TokenizationRequest,
7
+ TokenizationToken,
8
+ )
9
+ from helm.tokenizers.grok_tokenizer import GrokAPITokenizer
10
+
11
+
12
+ @pytest.mark.models
13
+ def test_tokenize():
14
+ if not os.environ.get("XAI_API_KEY"):
15
+ pytest.skip("No xAI API key found; skipping test")
16
+ tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
17
+ request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦")
18
+ result = tokenizer.tokenize(request)
19
+ assert result.success
20
+ assert not result.cached
21
+ assert result.tokens == [TokenizationToken(token) for token in ["otter", "", "", ""]]
22
+
23
+
24
+ @pytest.mark.models
25
+ def test_encode():
26
+ if not os.environ.get("XAI_API_KEY"):
27
+ pytest.skip("No xAI API key found; skipping test")
28
+ tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
29
+ request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦", encode=True)
30
+ result = tokenizer.tokenize(request)
31
+ assert result.success
32
+ assert not result.cached
33
+ assert result.tokens == [TokenizationToken(token) for token in [142507, 11637, 294, 294]]
@@ -1,46 +0,0 @@
1
- import pytest
2
- import re
3
- from tempfile import TemporaryDirectory
4
- from helm.benchmark.scenarios.infinite_bench_sum_scenario import InfiniteBenchSumScenario
5
- from helm.benchmark.scenarios.scenario import CORRECT_TAG
6
-
7
-
8
- def count_words(text: str) -> int:
9
- return len(re.split(r"\s+", text.strip()))
10
-
11
-
12
- @pytest.mark.scenarios
13
- def test_infinite_bench_sum_scenario():
14
- with TemporaryDirectory() as tmpdir:
15
- scenario = InfiniteBenchSumScenario(min_num_words=0, max_num_words=10000000)
16
- instances = scenario.get_instances(tmpdir)
17
- assert len(instances) == 103
18
- assert instances[0].split == "test"
19
- assert len(instances[0].input.text) == 1745528
20
- assert instances[0].extra_data
21
- assert instances[0].extra_data["word_count"] == 308762
22
- references = instances[0].references
23
- assert len(references[0].output.text) == 2865
24
- assert references[0].tags == [CORRECT_TAG]
25
-
26
- scenario = InfiniteBenchSumScenario(min_num_words=0, max_num_words=100000)
27
- instances = scenario.get_instances(tmpdir)
28
- assert len(instances) == 48
29
- assert instances[0].split == "test"
30
- assert len(instances[0].input.text) == 381778
31
- assert instances[0].extra_data
32
- assert instances[0].extra_data["word_count"] == 69458
33
- references = instances[0].references
34
- assert len(references[0].output.text) == 4217
35
- assert references[0].tags == [CORRECT_TAG]
36
-
37
- scenario = InfiniteBenchSumScenario(min_num_words=30000, max_num_words=80000)
38
- instances = scenario.get_instances(tmpdir)
39
- assert len(instances) == 32
40
- assert instances[0].split == "test"
41
- assert len(instances[1].input.text) == 383396
42
- assert instances[1].extra_data
43
- assert instances[1].extra_data["word_count"] == 68482
44
- references = instances[1].references
45
- assert len(references[0].output.text) == 5667
46
- assert references[0].tags == [CORRECT_TAG]