crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/config/tokenizer_configs.yaml CHANGED
@@ -252,6 +252,25 @@ tokenizer_configs:
  end_of_text_token: "<eos>"
  prefix_token: "<bos>"
 
+ # Grok
+ - name: xai/grok-3-beta
+ tokenizer_spec:
+ class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ - name: xai/grok-3-mini-beta
+ tokenizer_spec:
+ class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ - name: xai/grok-4-0709
+ tokenizer_spec:
+ class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
 
  # Hf-internal-testing
 
  # Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
@@ -299,6 +318,22 @@ tokenizer_configs:
  end_of_text_token: "<|endoftext|>"
  prefix_token: "<|endoftext|>"
 
+ - name: huggingface/smollm2-135m
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ - name: huggingface/smollm2-135m-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|im_end|>"
+
  # Lighting AI
  - name: lightningai/lit-gpt
  tokenizer_spec:
@@ -396,6 +431,14 @@ tokenizer_configs:
  prefix_token: "<|begin_of_text|>"
  end_of_text_token: "<|eot_id|>"
 
+ - name: meta/llama-4-scout-17b-16e-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: meta-llama/Llama-4-Scout-17B-16E-Instruct
+ prefix_token: "<|begin_of_text|>"
+ end_of_text_token: "<|end_of_text|>"
+
  # 01-ai
  - name: 01-ai/Yi-6B
  tokenizer_spec:
@@ -432,9 +475,33 @@ tokenizer_configs:
  end_of_text_token: "<|endoftext|>"
  prefix_token: ""
 
+ - name: allenai/olmo-2-1124-7b-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ - name: allenai/olmo-2-0325-32b-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ - name: allenai/olmoe-1b-7b-0125-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "|||IP_ADDRESS|||"
+ prefix_token: "|||IP_ADDRESS|||"
+
+ # Marin Community
+ - name: marin-community/marin-8b-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|eot_id|>"
+ prefix_token: "<|begin_of_text|>"
 
  # Microsoft
- - name: microsoft/phi-2
+ - name: microsoft/phi-2
  tokenizer_spec:
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
  end_of_text_token: "<|endoftext|>"
@@ -521,6 +588,17 @@ tokenizer_configs:
  end_of_text_token: "</s>"
  prefix_token: "<s>"
 
+ # Moonshot AI
+ - name: moonshotai/kimi-k2-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: moonshotai/Kimi-K2-Instruct
+ trust_remote_code: true
+ revision: 4f239503ad9d1a042f0a4bacac457931ab972cfc
+ end_of_text_token: "[EOS]"
+ prefix_token: "[BOS]"
+
  # Nectec
  - name: nectec/OpenThaiLLM-Prebuilt-7B
  tokenizer_spec:
@@ -619,6 +697,14 @@ tokenizer_configs:
  end_of_text_token: "<|im_end|>"
  prefix_token: "<|im_start|>"
 
+ - name: qwen/qwen3-235b-a22b
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: Qwen/Qwen3-235B-A22B
+ end_of_text_token: "<|im_end|>"
+ prefix_token: "<|im_start|>"
+
  - name: qwen/qwq-32b-preview
  tokenizer_spec:
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -664,6 +750,15 @@ tokenizer_configs:
  end_of_text_token: "<|endoftext|>"
  prefix_token: ""
 
+ - name: qwen/qwen2.5-omni-7b
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: Qwen/Qwen2.5-Omni-7B
+ trust_remote_code: false
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: ""
+
  # SambaLingo
  - name: sambanova/sambalingo-thai-base
  tokenizer_spec:
@@ -814,6 +909,7 @@ tokenizer_configs:
  end_of_text_token: ""
  prefix_token: ""
 
+ # Maritaca AI
  - name: maritaca-ai/sabia-7b
  tokenizer_spec:
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -822,6 +918,14 @@ tokenizer_configs:
  end_of_text_token: "</s>"
  prefix_token: "<s>"
 
+ - name: maritaca-ai/sabia-2-tokenizer-medium
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: maritaca-ai/sabia-2-tokenizer-medium
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
  # Granite-3.1-8b-base
  - name: ibm-granite/granite-3.1-8b-base
  tokenizer_spec:
@@ -910,8 +1014,6 @@ tokenizer_configs:
  prefix_token: ""
  end_of_text_token: ""
 
-
-
  - name: ibm-granite/granite-34b-code-instruct-8k
  tokenizer_spec:
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -945,7 +1047,17 @@ tokenizer_configs:
  prefix_token: ""
  end_of_text_token: ""
 
+ # IBM Granite 3.3
 
+ - name: ibm/granite-3.3-8b-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: ibm-granite/granite-3.3-8b-instruct
+ end_of_text_token: "<|end_of_text|>"
+ prefix_token: "<|end_of_text|>"
+
+
 
  # DeepSeek-R1-Distill-Llama-3.1-8b
  - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
@@ -964,3 +1076,112 @@ tokenizer_configs:
  pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
  end_of_text_token: "<|end▁of▁sentence|>"
  prefix_token: "<|begin▁of▁sentence|>"
+
+
+ # vilm/vinallama-2.7b-chat
+ - name: vilm/vinallama-2.7b-chat
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: vilm/vinallama-2.7b-chat
+ end_of_text_token: "<im_end>"
+ prefix_token: "<im_start>"
+
+ # vilm/vinallama-7b-chat
+ - name: vilm/vinallama-7b-chat
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: vilm/vinallama-7b-chat
+ end_of_text_token: "<im_end>"
+ prefix_token: "<im_start>"
+
+ # vilm/vietcuna-7b-v3
+ - name: vilm/vietcuna-7b-v3
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: vilm/vietcuna-7b-v3
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # Viet-Mistral/Vistral-7B-Chat
+ - name: Viet-Mistral/Vistral-7B-Chat
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: Viet-Mistral/Vistral-7B-Chat
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # vinai/PhoGPT-7B5-Instruct
+ - name: vinai/PhoGPT-7B5-Instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: vinai/PhoGPT-7B5-Instruct
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # vinai/PhoGPT-4B-Chat
+ - name: vinai/PhoGPT-4B-Chat
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # Gemma-3-Gaia-PT-BR-4b-it
+ - name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+ end_of_text_token: "<eos>"
+ prefix_token: "<bos>"
+
+ # Bode 13B Alpaca PT-BR
+ - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # Cabrita 7B PT-BR tokenizer
+ - name: 22h/cabrita_7b_pt_850000
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # Gervásio 7B PT‑BR/PT‑PT tokenizer
+ - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # Tucano 2b4 PT-BR tokenizer
+ - name: TucanoBR/Tucano-2b4
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: TucanoBR/Tucano-2b4
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # TeenyTinyLlama 460M PT-BR tokenizer
+ - name: nicholasKluge/TeenyTinyLlama-460m
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
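
All of the new entries above follow the same shape as the existing ones: a name, a tokenizer_spec with a class_name (plus optional args such as pretrained_model_name_or_path, trust_remote_code, or a pinned revision), and the prefix_token/end_of_text_token pair. A minimal standalone sketch for sanity-checking a locally edited copy of the file; the script, its default path, and the duplicate/field checks are illustrative assumptions, not part of this release:

import sys
from collections import Counter

import yaml  # assumed available; HELM reads this config through its own loaders

REQUIRED_FIELDS = {"name", "tokenizer_spec", "end_of_text_token", "prefix_token"}


def check_tokenizer_configs(path: str = "helm/config/tokenizer_configs.yaml") -> None:
    with open(path) as f:
        configs = yaml.safe_load(f)["tokenizer_configs"]
    # Every entry should carry the four fields used throughout this diff.
    for entry in configs:
        missing = REQUIRED_FIELDS - entry.keys()
        if missing:
            print(f"{entry.get('name', '<unnamed>')}: missing {sorted(missing)}")
    # Names such as xai/grok-3-beta or qwen/qwen3-235b-a22b must be unique.
    duplicates = [name for name, count in Counter(e["name"] for e in configs).items() if count > 1]
    if duplicates:
        print(f"duplicate tokenizer names: {duplicates}")
        sys.exit(1)


if __name__ == "__main__":
    check_tokenizer_configs(*sys.argv[1:])
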
helm/proxy/cli.py CHANGED
@@ -21,7 +21,7 @@ from typing import List, Dict
  import re
  import sys
 
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hlog, setup_default_logging
  from helm.common.authentication import Authentication
  from helm.proxy.accounts import Usage, Account
  from helm.proxy.services.remote_service import RemoteService, add_service_args, create_authentication
@@ -123,7 +123,7 @@ def do_create_update_command(service: RemoteService, auth: Authentication, args)
 
  # Update quotas
  for quota_str in args.quotas:
- m = re.match(f"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
+ m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
  if not m:
  raise Exception(
  f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota> "
@@ -198,6 +198,8 @@ def main():
 
  args = parser.parse_args()
 
+ setup_default_logging()
+
  service = create_remote_service(args)
  auth = create_authentication(args)
 
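
The first cli.py hunk only adds setup_default_logging to the hierarchical_logger import; the second turns the quota pattern into a raw string, which silences Python's invalid-escape-sequence warning without changing what it matches. A hedged sketch of parsing the <model_group>.<granularity>=<quota> format; the UNLIMITED_QUOTA value used here is a placeholder, since the real constant lives elsewhere in the proxy code:

import re

UNLIMITED_QUOTA = "unlimited"  # assumption: stand-in for the real constant

def parse_quota(quota_str: str) -> tuple[str, str, str]:
    # Mirrors the <model_group>.<granularity>=<quota> format expected by do_create_update_command.
    m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
    if not m:
        raise ValueError(f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota>")
    return m.group(1), m.group(2), m.group(3)

print(parse_quota("gpt4.daily=10000"))        # ('gpt4', 'daily', '10000')
print(parse_quota("gpt3.monthly=unlimited"))  # ('gpt3', 'monthly', 'unlimited')
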
helm/proxy/critique/mechanical_turk_utils.py CHANGED
@@ -38,7 +38,7 @@ def replace_emoji_characters(s: str) -> str:
  highpoints = re.compile("[\U00010000-\U0010ffff]")
  elif sys.maxunicode == 65535:
  # Python was built with '--enable-unicode=ucs2'
- highpoints = re.compile("[\uD800-\uDBFF][\uDC00-\uDFFF]")
+ highpoints = re.compile("[\ud800-\udbff][\udc00-\udfff]")
  else:
  raise UnicodeError("Unable to determine if Python was built using UCS-2 or UCS-4")
 
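
This hunk only lowercases the hex escapes in the UCS-2 surrogate-pair pattern; Python treats \uD800 and \ud800 as the same code point, so the pattern string is unchanged (the rewrite presumably just satisfies a style or lint check). A one-line check of that equivalence:

# The two spellings decode to the identical pattern string.
assert "[\uD800-\uDBFF][\uDC00-\uDFFF]" == "[\ud800-\udbff][\udc00-\udfff]"
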
helm/proxy/retry.py CHANGED
@@ -5,6 +5,7 @@ from retrying import Retrying
  from helm.common.request import RequestResult
  from helm.common.tokenization_request import TokenizationRequestResult
  from helm.common.hierarchical_logger import hlog
+ import os
  import traceback
  import threading
 
@@ -19,6 +20,10 @@ Example usage:
  ...
  """
 
+ # TODO: make these configurable at a config / cli level
+ HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
+ HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))
+
  # The lock is used to prevent multiple threads from printing at the same time.
  # This can cause issues when printing the stack trace.
  # (The stack traces can get mixed up and become unreadable.)
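
With this change the retry budget comes from the environment, with HELM_TOKENIZER_RETRIES falling back to whatever HELM_RETRIES resolves to. A minimal standard-library sketch of the same pattern; the call_with_retries helper and its backoff schedule are illustrative, not HELM's retrying-based retry_request decorator:

import os
import time

HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))


def call_with_retries(fn, *args, max_attempts: int = HELM_RETRIES, base_delay: float = 1.0):
    """Retry fn up to max_attempts times with simple exponential backoff."""
    for attempt in range(1, max_attempts + 1):
        try:
            return fn(*args)
        except Exception:  # HELM's decorator is more selective about which results count as failures
            if attempt == max_attempts:
                raise
            time.sleep(base_delay * 2 ** (attempt - 1))
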
helm/proxy/services/server_service.py CHANGED
@@ -1,8 +1,8 @@
- import dataclasses
  import os
  import signal
- from typing import List, Optional
+ from typing import List
 
+ from helm.common.local_context import LocalContext
  from helm.common.cache import CacheConfig
  from helm.common.cache_backend_config import CacheBackendConfig, BlackHoleCacheBackendConfig
  from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
@@ -11,7 +11,6 @@ from helm.common.moderations_api_request import ModerationAPIRequest, Moderation
  from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
  from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
  from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
- from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
  from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
  from helm.common.tokenization_request import (
  TokenizationRequest,
@@ -22,27 +21,13 @@ from helm.common.tokenization_request import (
  from helm.common.request import Request, RequestResult
  from helm.common.hierarchical_logger import hlog
  from helm.proxy.accounts import Accounts, Account
- from helm.clients.auto_client import AutoClient
- from helm.clients.moderation_api_client import ModerationAPIClient
- from helm.clients.image_generation.nudity_check_client import NudityCheckClient
- from helm.clients.gcs_client import GCSClient
- from helm.clients.clip_score_client import CLIPScoreClient
- from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
- from helm.proxy.example_queries import example_queries
- from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
  from helm.benchmark.model_deployment_registry import get_model_deployment_host_organization
  from helm.proxy.query import Query, QueryResult
- from helm.proxy.retry import retry_request
  from helm.proxy.token_counters.auto_token_counter import AutoTokenCounter
- from helm.tokenizers.auto_tokenizer import AutoTokenizer
  from helm.proxy.services.service import (
  Service,
- CACHE_DIR,
  ACCOUNTS_FILE,
  GeneralInfo,
- VERSION,
- expand_environments,
- synthesize_request,
  )
 
 
@@ -57,43 +42,17 @@ class ServerService(Service):
  root_mode: bool = False,
  cache_backend_config: CacheBackendConfig = BlackHoleCacheBackendConfig(),
  ):
- ensure_directory_exists(base_path)
- client_file_storage_path = os.path.join(base_path, CACHE_DIR)
- ensure_directory_exists(client_file_storage_path)
-
- credentials = get_credentials(base_path)
  accounts_path = os.path.join(base_path, ACCOUNTS_FILE)
 
- self.cache_backend_config = cache_backend_config
- self.client = AutoClient(credentials, client_file_storage_path, cache_backend_config)
- self.tokenizer = AutoTokenizer(credentials, cache_backend_config)
- self.token_counter = AutoTokenCounter(self.tokenizer)
+ self.context = LocalContext(base_path, cache_backend_config)
+ self.token_counter = AutoTokenCounter(self.context.tokenizer)
  self.accounts = Accounts(accounts_path, root_mode=root_mode)
 
- # Lazily instantiate the following clients
- self.moderation_api_client: Optional[ModerationAPIClient] = None
- self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
- self.perspective_api_client: Optional[ToxicityClassifierClient] = None
- self.nudity_check_client: Optional[NudityCheckClient] = None
- self.clip_score_client: Optional[CLIPScoreClient] = None
- self.gcs_client: Optional[GCSClient] = None
-
  def get_general_info(self) -> GeneralInfo:
- # Can't send release_dates in ModelMetadata bacause dates cannot be round-tripped to and from JSON easily.
- # TODO(#2158): Either fix this or delete get_general_info.
- all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
- return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
+ return self.context.get_general_info()
 
  def expand_query(self, query: Query) -> QueryResult:
- """Turn the `query` into requests."""
- prompt = query.prompt
- settings = query.settings
- environments = parse_hocon(query.environments)
- requests = []
- for environment in expand_environments(environments):
- request = synthesize_request(prompt, settings, environment)
- requests.append(request)
- return QueryResult(requests=requests)
+ return self.context.expand_query(query)
 
  def _get_model_group_for_model_deployment(self, model_deployment: str) -> str:
  if model_deployment.startswith("openai/"):
@@ -105,7 +64,11 @@ class ServerService(Service):
  return "gpt4"
  elif model_deployment.startswith("openai/gpt-3"):
  return "gpt3"
- elif model_deployment.startswith("openai/o1"):
+ elif (
+ model_deployment.startswith("openai/o1")
+ or model_deployment.startswith("openai/o3")
+ or model_deployment.startswith("openai/o4")
+ ):
  return "o1"
  else:
  return "openai"
@@ -126,7 +89,7 @@ class ServerService(Service):
  self.accounts.check_can_use(auth.api_key, model_group)
 
  # Use!
- request_result: RequestResult = self.client.make_request(request)
+ request_result: RequestResult = self.context.make_request(request)
 
  # Only deduct if not cached
  if not request_result.cached:
@@ -139,66 +102,39 @@ class ServerService(Service):
  def tokenize(self, auth: Authentication, request: TokenizationRequest) -> TokenizationRequestResult:
  """Tokenize via an API."""
  self.accounts.authenticate(auth)
- return self.tokenizer.tokenize(request)
+ return self.context.tokenize(request)
 
  def decode(self, auth: Authentication, request: DecodeRequest) -> DecodeRequestResult:
  """Decodes to text."""
  self.accounts.authenticate(auth)
- return self.tokenizer.decode(request)
+ return self.context.decode(request)
 
  def upload(self, auth: Authentication, request: FileUploadRequest) -> FileUploadResult:
  """Uploads a file to external storage."""
  self.accounts.authenticate(auth)
-
- if not self.gcs_client:
- self.gcs_client = self.client.get_gcs_client()
-
- assert self.gcs_client
- return self.gcs_client.upload(request)
+ return self.context.upload(request)
 
  def check_nudity(self, auth: Authentication, request: NudityCheckRequest) -> NudityCheckResult:
  """Check for nudity."""
  self.accounts.authenticate(auth)
-
- if not self.nudity_check_client:
- self.nudity_check_client = self.client.get_nudity_check_client()
-
- assert self.nudity_check_client
- return self.nudity_check_client.check_nudity(request)
+ return self.context.check_nudity(request)
 
  def compute_clip_score(self, auth: Authentication, request: CLIPScoreRequest) -> CLIPScoreResult:
  """Computes CLIPScore for a given caption and image."""
  self.accounts.authenticate(auth)
-
- if not self.clip_score_client:
- self.clip_score_client = self.client.get_clip_score_client()
-
- assert self.clip_score_client
- return self.clip_score_client.compute_score(request)
+ return self.context.compute_clip_score(request)
 
  def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
- @retry_request
- def get_toxicity_scores_with_retry(request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
- if not self.toxicity_classifier_client:
- self.toxicity_classifier_client = self.client.get_toxicity_classifier_client()
- return self.toxicity_classifier_client.get_toxicity_scores(request)
-
  self.accounts.authenticate(auth)
- return get_toxicity_scores_with_retry(request)
+ return self.context.get_toxicity_scores(request)
 
  def get_moderation_results(self, auth: Authentication, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
- @retry_request
- def get_moderation_results_with_retry(request: ModerationAPIRequest) -> ModerationAPIRequestResult:
- if not self.moderation_api_client:
- self.moderation_api_client = self.client.get_moderation_api_client()
- return self.moderation_api_client.get_moderation_results(request)
-
  self.accounts.authenticate(auth)
- return get_moderation_results_with_retry(request)
+ return self.context.get_moderation_results(request)
 
  def make_critique_request(self, auth: Authentication, request: CritiqueRequest) -> CritiqueRequestResult:
  self.accounts.authenticate(auth)
- return self.client.get_critique_client().make_critique_request(request)
+ return self.context.make_critique_request(request)
 
  def create_account(self, auth: Authentication) -> Account:
  """Creates a new account."""
@@ -233,4 +169,4 @@ class ServerService(Service):
  hlog("Done.")
 
  def get_cache_config(self, shard_name: str) -> CacheConfig:
- return self.cache_backend_config.get_cache_config(shard_name)
+ return self.context.get_cache_config(shard_name)
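
Most of the server_service.py diff is a mechanical refactor: the hand-built AutoClient/AutoTokenizer fields and the half-dozen lazily created helper clients are replaced by a single LocalContext (added in this release under helm/common/local_context.py), and each service method becomes authenticate-then-delegate. A schematic sketch of the resulting shape; the Context protocol and class names below are illustrative and much smaller than the real ones:

from typing import Any, Protocol


class Context(Protocol):
    # Two representative operations; the real context exposes the full request/tokenize/score surface.
    def make_request(self, request: Any) -> Any: ...
    def tokenize(self, request: Any) -> Any: ...


class ServerServiceSketch:
    def __init__(self, accounts: Any, context: Context) -> None:
        self.accounts = accounts
        self.context = context  # replaces self.client, self.tokenizer, and the lazy per-API clients

    def make_request(self, auth: Any, request: Any) -> Any:
        self.accounts.authenticate(auth)           # unchanged: authenticate first
        return self.context.make_request(request)  # then forward to the shared context

    def tokenize(self, auth: Any, request: Any) -> Any:
        self.accounts.authenticate(auth)
        return self.context.tokenize(request)
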
helm/tokenizers/grok_tokenizer.py ADDED
@@ -0,0 +1,55 @@
+ import dataclasses
+ import os
+ from typing import Any, Dict, List, Optional
+
+ import requests
+
+ from helm.common.cache import CacheConfig
+ from helm.common.tokenization_request import (
+ TokenizationRequest,
+ TokenizationToken,
+ )
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
+
+
+ class GrokAPITokenizer(CachingTokenizer):
+ """Tokenizer that uses the xAI Grok Tokenize Text API
+
+ Doc: https://docs.x.ai/docs/api-reference#tokenize-text"""
+
+ def __init__(self, cache_config: CacheConfig, api_key: Optional[str] = None) -> None:
+ super().__init__(cache_config)
+ self.api_key = api_key or os.environ.get("XAI_API_KEY")
+
+ def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
+ cache_key = dataclasses.asdict(request)
+ # Delete encode because the Grok API simulateously gives string and integer tokens.
+ del cache_key["encode"]
+ return cache_key
+
+ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+ if not self.api_key:
+ raise Exception(
+ "No Grok API key found. "
+ "Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
+ )
+ text = request["text"]
+ if not text:
+ return {"token_ids": []}
+ model = request["tokenizer"].split("/")[-1]
+ response = requests.post(
+ url="https://api.x.ai/v1/tokenize-text",
+ headers={"Authorization": f"Bearer {self.api_key}"},
+ json={"text": text, "model": model},
+ )
+ response.raise_for_status()
+ return response.json()
+
+ def _tokenization_raw_response_to_tokens(
+ self, response: Dict[str, Any], request: TokenizationRequest
+ ) -> List[TokenizationToken]:
+ raw_token_field_name = "token_id" if request.encode else "string_token"
+ return [TokenizationToken(raw_token[raw_token_field_name]) for raw_token in response["token_ids"]]
+
+ def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+ raise NotImplementedError("The xAI API does not support decoding.")
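
Since GrokAPITokenizer wraps a single HTTP endpoint, the call it makes can be reproduced directly for debugging. A standalone sketch of that request; the endpoint, payload, and the token_id/string_token fields are taken from the code above, while the helper name and default model are assumptions:

import os

import requests


def tokenize_with_grok(text: str, model: str = "grok-3-beta") -> list[tuple[int, str]]:
    """Call the xAI tokenize-text endpoint and return (token_id, string_token) pairs."""
    api_key = os.environ["XAI_API_KEY"]  # same variable the tokenizer falls back to
    response = requests.post(
        "https://api.x.ai/v1/tokenize-text",
        headers={"Authorization": f"Bearer {api_key}"},
        json={"text": text, "model": model},
    )
    response.raise_for_status()
    # Each element carries both representations, which is why the cache key above drops `encode`.
    return [(tok["token_id"], tok["string_token"]) for tok in response.json()["token_ids"]]


if __name__ == "__main__":
    print(tokenize_with_grok("otter 🦦"))
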
helm/tokenizers/huggingface_tokenizer.py CHANGED
@@ -119,7 +119,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
  tokens = tokenizer.encode(
  request["text"],
  truncation=request["truncation"],
- max_length=request["max_length"],
+ max_length=max(request["max_length"], 0),
  add_special_tokens=False,
  )
  else:
helm/tokenizers/test_grok_tokenizer.py ADDED
@@ -0,0 +1,33 @@
+ import os
+ import pytest
+
+ from helm.common.cache import BlackHoleCacheConfig
+ from helm.common.tokenization_request import (
+ TokenizationRequest,
+ TokenizationToken,
+ )
+ from helm.tokenizers.grok_tokenizer import GrokAPITokenizer
+
+
+ @pytest.mark.models
+ def test_tokenize():
+ if not os.environ.get("XAI_API_KEY"):
+ pytest.skip("No xAI API key found; skipping test")
+ tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
+ request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦")
+ result = tokenizer.tokenize(request)
+ assert result.success
+ assert not result.cached
+ assert result.tokens == [TokenizationToken(token) for token in ["otter", "", "", ""]]
+
+
+ @pytest.mark.models
+ def test_encode():
+ if not os.environ.get("XAI_API_KEY"):
+ pytest.skip("No xAI API key found; skipping test")
+ tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
+ request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦", encode=True)
+ result = tokenizer.tokenize(request)
+ assert result.success
+ assert not result.cached
+ assert result.tokens == [TokenizationToken(token) for token in [142507, 11637, 294, 294]]