crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
|
@@ -252,6 +252,25 @@ tokenizer_configs:
|
|
|
252
252
|
end_of_text_token: "<eos>"
|
|
253
253
|
prefix_token: "<bos>"
|
|
254
254
|
|
|
255
|
+
# Grok
|
|
256
|
+
- name: xai/grok-3-beta
|
|
257
|
+
tokenizer_spec:
|
|
258
|
+
class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
|
|
259
|
+
end_of_text_token: ""
|
|
260
|
+
prefix_token: ""
|
|
261
|
+
|
|
262
|
+
- name: xai/grok-3-mini-beta
|
|
263
|
+
tokenizer_spec:
|
|
264
|
+
class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
|
|
265
|
+
end_of_text_token: ""
|
|
266
|
+
prefix_token: ""
|
|
267
|
+
|
|
268
|
+
- name: xai/grok-4-0709
|
|
269
|
+
tokenizer_spec:
|
|
270
|
+
class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
|
|
271
|
+
end_of_text_token: ""
|
|
272
|
+
prefix_token: ""
|
|
273
|
+
|
|
255
274
|
# Hf-internal-testing
|
|
256
275
|
|
|
257
276
|
# Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
|
|
@@ -299,6 +318,22 @@ tokenizer_configs:
|
|
|
299
318
|
end_of_text_token: "<|endoftext|>"
|
|
300
319
|
prefix_token: "<|endoftext|>"
|
|
301
320
|
|
|
321
|
+
- name: huggingface/smollm2-135m
|
|
322
|
+
tokenizer_spec:
|
|
323
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
324
|
+
args:
|
|
325
|
+
pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M
|
|
326
|
+
end_of_text_token: "<|endoftext|>"
|
|
327
|
+
prefix_token: "<|endoftext|>"
|
|
328
|
+
|
|
329
|
+
- name: huggingface/smollm2-135m-instruct
|
|
330
|
+
tokenizer_spec:
|
|
331
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
332
|
+
args:
|
|
333
|
+
pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
|
|
334
|
+
end_of_text_token: "<|endoftext|>"
|
|
335
|
+
prefix_token: "<|im_end|>"
|
|
336
|
+
|
|
302
337
|
# Lightning AI
|
|
303
338
|
- name: lightningai/lit-gpt
|
|
304
339
|
tokenizer_spec:
|
|
@@ -396,6 +431,14 @@ tokenizer_configs:
|
|
|
396
431
|
prefix_token: "<|begin_of_text|>"
|
|
397
432
|
end_of_text_token: "<|eot_id|>"
|
|
398
433
|
|
|
434
|
+
- name: meta/llama-4-scout-17b-16e-instruct
|
|
435
|
+
tokenizer_spec:
|
|
436
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
437
|
+
args:
|
|
438
|
+
pretrained_model_name_or_path: meta-llama/Llama-4-Scout-17B-16E-Instruct
|
|
439
|
+
prefix_token: "<|begin_of_text|>"
|
|
440
|
+
end_of_text_token: "<|end_of_text|>"
|
|
441
|
+
|
|
399
442
|
# 01-ai
|
|
400
443
|
- name: 01-ai/Yi-6B
|
|
401
444
|
tokenizer_spec:
|
|
@@ -432,9 +475,33 @@ tokenizer_configs:
|
|
|
432
475
|
end_of_text_token: "<|endoftext|>"
|
|
433
476
|
prefix_token: ""
|
|
434
477
|
|
|
478
|
+
- name: allenai/olmo-2-1124-7b-instruct
|
|
479
|
+
tokenizer_spec:
|
|
480
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
481
|
+
end_of_text_token: "<|endoftext|>"
|
|
482
|
+
prefix_token: "<|endoftext|>"
|
|
483
|
+
|
|
484
|
+
- name: allenai/olmo-2-0325-32b-instruct
|
|
485
|
+
tokenizer_spec:
|
|
486
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
487
|
+
end_of_text_token: "<|endoftext|>"
|
|
488
|
+
prefix_token: "<|endoftext|>"
|
|
489
|
+
|
|
490
|
+
- name: allenai/olmoe-1b-7b-0125-instruct
|
|
491
|
+
tokenizer_spec:
|
|
492
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
493
|
+
end_of_text_token: "|||IP_ADDRESS|||"
|
|
494
|
+
prefix_token: "|||IP_ADDRESS|||"
|
|
495
|
+
|
|
496
|
+
# Marin Community
|
|
497
|
+
- name: marin-community/marin-8b-instruct
|
|
498
|
+
tokenizer_spec:
|
|
499
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
500
|
+
end_of_text_token: "<|eot_id|>"
|
|
501
|
+
prefix_token: "<|begin_of_text|>"
|
|
435
502
|
|
|
436
503
|
# Microsoft
|
|
437
|
-
- name: microsoft/phi-2
|
|
504
|
+
- name: microsoft/phi-2
|
|
438
505
|
tokenizer_spec:
|
|
439
506
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
440
507
|
end_of_text_token: "<|endoftext|>"
|
|
@@ -521,6 +588,17 @@ tokenizer_configs:
|
|
|
521
588
|
end_of_text_token: "</s>"
|
|
522
589
|
prefix_token: "<s>"
|
|
523
590
|
|
|
591
|
+
# Moonshot AI
|
|
592
|
+
- name: moonshotai/kimi-k2-instruct
|
|
593
|
+
tokenizer_spec:
|
|
594
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
595
|
+
args:
|
|
596
|
+
pretrained_model_name_or_path: moonshotai/Kimi-K2-Instruct
|
|
597
|
+
trust_remote_code: true
|
|
598
|
+
revision: 4f239503ad9d1a042f0a4bacac457931ab972cfc
|
|
599
|
+
end_of_text_token: "[EOS]"
|
|
600
|
+
prefix_token: "[BOS]"
|
|
601
|
+
|
|
524
602
|
# Nectec
|
|
525
603
|
- name: nectec/OpenThaiLLM-Prebuilt-7B
|
|
526
604
|
tokenizer_spec:
|
|
@@ -619,6 +697,14 @@ tokenizer_configs:
|
|
|
619
697
|
end_of_text_token: "<|im_end|>"
|
|
620
698
|
prefix_token: "<|im_start|>"
|
|
621
699
|
|
|
700
|
+
- name: qwen/qwen3-235b-a22b
|
|
701
|
+
tokenizer_spec:
|
|
702
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
703
|
+
args:
|
|
704
|
+
pretrained_model_name_or_path: Qwen/Qwen3-235B-A22B
|
|
705
|
+
end_of_text_token: "<|im_end|>"
|
|
706
|
+
prefix_token: "<|im_start|>"
|
|
707
|
+
|
|
622
708
|
- name: qwen/qwq-32b-preview
|
|
623
709
|
tokenizer_spec:
|
|
624
710
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -664,6 +750,15 @@ tokenizer_configs:
|
|
|
664
750
|
end_of_text_token: "<|endoftext|>"
|
|
665
751
|
prefix_token: ""
|
|
666
752
|
|
|
753
|
+
- name: qwen/qwen2.5-omni-7b
|
|
754
|
+
tokenizer_spec:
|
|
755
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
756
|
+
args:
|
|
757
|
+
pretrained_model_name_or_path: Qwen/Qwen2.5-Omni-7B
|
|
758
|
+
trust_remote_code: false
|
|
759
|
+
end_of_text_token: "<|endoftext|>"
|
|
760
|
+
prefix_token: ""
|
|
761
|
+
|
|
667
762
|
# SambaLingo
|
|
668
763
|
- name: sambanova/sambalingo-thai-base
|
|
669
764
|
tokenizer_spec:
|
|
@@ -814,6 +909,7 @@ tokenizer_configs:
|
|
|
814
909
|
end_of_text_token: ""
|
|
815
910
|
prefix_token: ""
|
|
816
911
|
|
|
912
|
+
# Maritaca AI
|
|
817
913
|
- name: maritaca-ai/sabia-7b
|
|
818
914
|
tokenizer_spec:
|
|
819
915
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -822,6 +918,14 @@ tokenizer_configs:
|
|
|
822
918
|
end_of_text_token: "</s>"
|
|
823
919
|
prefix_token: "<s>"
|
|
824
920
|
|
|
921
|
+
- name: maritaca-ai/sabia-2-tokenizer-medium
|
|
922
|
+
tokenizer_spec:
|
|
923
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
924
|
+
args:
|
|
925
|
+
pretrained_model_name_or_path: maritaca-ai/sabia-2-tokenizer-medium
|
|
926
|
+
end_of_text_token: "</s>"
|
|
927
|
+
prefix_token: "<s>"
|
|
928
|
+
|
|
825
929
|
# Granite-3.1-8b-base
|
|
826
930
|
- name: ibm-granite/granite-3.1-8b-base
|
|
827
931
|
tokenizer_spec:
|
|
@@ -910,8 +1014,6 @@ tokenizer_configs:
|
|
|
910
1014
|
prefix_token: ""
|
|
911
1015
|
end_of_text_token: ""
|
|
912
1016
|
|
|
913
|
-
|
|
914
|
-
|
|
915
1017
|
- name: ibm-granite/granite-34b-code-instruct-8k
|
|
916
1018
|
tokenizer_spec:
|
|
917
1019
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -945,7 +1047,17 @@ tokenizer_configs:
|
|
|
945
1047
|
prefix_token: ""
|
|
946
1048
|
end_of_text_token: ""
|
|
947
1049
|
|
|
1050
|
+
# IBM Granite 3.3
|
|
948
1051
|
|
|
1052
|
+
- name: ibm/granite-3.3-8b-instruct
|
|
1053
|
+
tokenizer_spec:
|
|
1054
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1055
|
+
args:
|
|
1056
|
+
pretrained_model_name_or_path: ibm-granite/granite-3.3-8b-instruct
|
|
1057
|
+
end_of_text_token: "<|end_of_text|>"
|
|
1058
|
+
prefix_token: "<|end_of_text|>"
|
|
1059
|
+
|
|
1060
|
+
|
|
949
1061
|
|
|
950
1062
|
# DeepSeek-R1-Distill-Llama-3.1-8b
|
|
951
1063
|
- name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
|
|
@@ -964,3 +1076,112 @@ tokenizer_configs:
|
|
|
964
1076
|
pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
|
|
965
1077
|
end_of_text_token: "<|end▁of▁sentence|>"
|
|
966
1078
|
prefix_token: "<|begin▁of▁sentence|>"
|
|
1079
|
+
|
|
1080
|
+
|
|
1081
|
+
# vilm/vinallama-2.7b-chat
|
|
1082
|
+
- name: vilm/vinallama-2.7b-chat
|
|
1083
|
+
tokenizer_spec:
|
|
1084
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1085
|
+
args:
|
|
1086
|
+
pretrained_model_name_or_path: vilm/vinallama-2.7b-chat
|
|
1087
|
+
end_of_text_token: "<im_end>"
|
|
1088
|
+
prefix_token: "<im_start>"
|
|
1089
|
+
|
|
1090
|
+
# vilm/vinallama-7b-chat
|
|
1091
|
+
- name: vilm/vinallama-7b-chat
|
|
1092
|
+
tokenizer_spec:
|
|
1093
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1094
|
+
args:
|
|
1095
|
+
pretrained_model_name_or_path: vilm/vinallama-7b-chat
|
|
1096
|
+
end_of_text_token: "<im_end>"
|
|
1097
|
+
prefix_token: "<im_start>"
|
|
1098
|
+
|
|
1099
|
+
# vilm/vietcuna-7b-v3
|
|
1100
|
+
- name: vilm/vietcuna-7b-v3
|
|
1101
|
+
tokenizer_spec:
|
|
1102
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1103
|
+
args:
|
|
1104
|
+
pretrained_model_name_or_path: vilm/vietcuna-7b-v3
|
|
1105
|
+
end_of_text_token: "</s>"
|
|
1106
|
+
prefix_token: "<s>"
|
|
1107
|
+
|
|
1108
|
+
# Viet-Mistral/Vistral-7B-Chat
|
|
1109
|
+
- name: Viet-Mistral/Vistral-7B-Chat
|
|
1110
|
+
tokenizer_spec:
|
|
1111
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1112
|
+
args:
|
|
1113
|
+
pretrained_model_name_or_path: Viet-Mistral/Vistral-7B-Chat
|
|
1114
|
+
end_of_text_token: "</s>"
|
|
1115
|
+
prefix_token: "<s>"
|
|
1116
|
+
|
|
1117
|
+
# vinai/PhoGPT-7B5-Instruct
|
|
1118
|
+
- name: vinai/PhoGPT-7B5-Instruct
|
|
1119
|
+
tokenizer_spec:
|
|
1120
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1121
|
+
args:
|
|
1122
|
+
pretrained_model_name_or_path: vinai/PhoGPT-7B5-Instruct
|
|
1123
|
+
end_of_text_token: "</s>"
|
|
1124
|
+
prefix_token: "<s>"
|
|
1125
|
+
|
|
1126
|
+
# vinai/PhoGPT-4B-Chat
|
|
1127
|
+
- name: vinai/PhoGPT-4B-Chat
|
|
1128
|
+
tokenizer_spec:
|
|
1129
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1130
|
+
args:
|
|
1131
|
+
pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
|
|
1132
|
+
end_of_text_token: "</s>"
|
|
1133
|
+
prefix_token: "<s>"
|
|
1134
|
+
|
|
1135
|
+
# Gemma-3-Gaia-PT-BR-4b-it
|
|
1136
|
+
- name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
|
|
1137
|
+
tokenizer_spec:
|
|
1138
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1139
|
+
args:
|
|
1140
|
+
pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
|
|
1141
|
+
end_of_text_token: "<eos>"
|
|
1142
|
+
prefix_token: "<bos>"
|
|
1143
|
+
|
|
1144
|
+
# Bode 13B Alpaca PT-BR
|
|
1145
|
+
- name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
|
|
1146
|
+
tokenizer_spec:
|
|
1147
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1148
|
+
args:
|
|
1149
|
+
pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
|
|
1150
|
+
end_of_text_token: "</s>"
|
|
1151
|
+
prefix_token: "<s>"
|
|
1152
|
+
|
|
1153
|
+
# Cabrita 7B PT-BR tokenizer
|
|
1154
|
+
- name: 22h/cabrita_7b_pt_850000
|
|
1155
|
+
tokenizer_spec:
|
|
1156
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1157
|
+
args:
|
|
1158
|
+
pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000
|
|
1159
|
+
end_of_text_token: "</s>"
|
|
1160
|
+
prefix_token: "<s>"
|
|
1161
|
+
|
|
1162
|
+
# Gervásio 7B PT‑BR/PT‑PT tokenizer
|
|
1163
|
+
- name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
|
|
1164
|
+
tokenizer_spec:
|
|
1165
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1166
|
+
args:
|
|
1167
|
+
pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
|
|
1168
|
+
end_of_text_token: "</s>"
|
|
1169
|
+
prefix_token: "<s>"
|
|
1170
|
+
|
|
1171
|
+
# Tucano 2b4 PT-BR tokenizer
|
|
1172
|
+
- name: TucanoBR/Tucano-2b4
|
|
1173
|
+
tokenizer_spec:
|
|
1174
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1175
|
+
args:
|
|
1176
|
+
pretrained_model_name_or_path: TucanoBR/Tucano-2b4
|
|
1177
|
+
end_of_text_token: "</s>"
|
|
1178
|
+
prefix_token: "<s>"
|
|
1179
|
+
|
|
1180
|
+
# TeenyTinyLlama 460M PT-BR tokenizer
|
|
1181
|
+
- name: nicholasKluge/TeenyTinyLlama-460m
|
|
1182
|
+
tokenizer_spec:
|
|
1183
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1184
|
+
args:
|
|
1185
|
+
pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
|
|
1186
|
+
end_of_text_token: "</s>"
|
|
1187
|
+
prefix_token: "<s>"
|
helm/proxy/cli.py
CHANGED
|
@@ -21,7 +21,7 @@ from typing import List, Dict
|
|
|
21
21
|
import re
|
|
22
22
|
import sys
|
|
23
23
|
|
|
24
|
-
from helm.common.hierarchical_logger import hlog
|
|
24
|
+
from helm.common.hierarchical_logger import hlog, setup_default_logging
|
|
25
25
|
from helm.common.authentication import Authentication
|
|
26
26
|
from helm.proxy.accounts import Usage, Account
|
|
27
27
|
from helm.proxy.services.remote_service import RemoteService, add_service_args, create_authentication
|
|
@@ -123,7 +123,7 @@ def do_create_update_command(service: RemoteService, auth: Authentication, args)
|
|
|
123
123
|
|
|
124
124
|
# Update quotas
|
|
125
125
|
for quota_str in args.quotas:
|
|
126
|
-
m = re.match(
|
|
126
|
+
m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
|
|
127
127
|
if not m:
|
|
128
128
|
raise Exception(
|
|
129
129
|
f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota> "
|
|
@@ -198,6 +198,8 @@ def main():
|
|
|
198
198
|
|
|
199
199
|
args = parser.parse_args()
|
|
200
200
|
|
|
201
|
+
setup_default_logging()
|
|
202
|
+
|
|
201
203
|
service = create_remote_service(args)
|
|
202
204
|
auth = create_authentication(args)
|
|
203
205
|
|
|
@@ -38,7 +38,7 @@ def replace_emoji_characters(s: str) -> str:
|
|
|
38
38
|
highpoints = re.compile("[\U00010000-\U0010ffff]")
|
|
39
39
|
elif sys.maxunicode == 65535:
|
|
40
40
|
# Python was built with '--enable-unicode=ucs2'
|
|
41
|
-
highpoints = re.compile("[\
|
|
41
|
+
highpoints = re.compile("[\ud800-\udbff][\udc00-\udfff]")
|
|
42
42
|
else:
|
|
43
43
|
raise UnicodeError("Unable to determine if Python was built using UCS-2 or UCS-4")
|
|
44
44
|
|
helm/proxy/retry.py
CHANGED
|
@@ -5,6 +5,7 @@ from retrying import Retrying
|
|
|
5
5
|
from helm.common.request import RequestResult
|
|
6
6
|
from helm.common.tokenization_request import TokenizationRequestResult
|
|
7
7
|
from helm.common.hierarchical_logger import hlog
|
|
8
|
+
import os
|
|
8
9
|
import traceback
|
|
9
10
|
import threading
|
|
10
11
|
|
|
@@ -19,6 +20,10 @@ Example usage:
|
|
|
19
20
|
...
|
|
20
21
|
"""
|
|
21
22
|
|
|
23
|
+
# TODO: make these configurable at a config / cli level
|
|
24
|
+
HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
|
|
25
|
+
HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))
|
|
26
|
+
|
|
22
27
|
# The lock is used to prevent multiple threads from printing at the same time.
|
|
23
28
|
# This can cause issues when printing the stack trace.
|
|
24
29
|
# (The stack traces can get mixed up and become unreadable.)
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import dataclasses
|
|
2
1
|
import os
|
|
3
2
|
import signal
|
|
4
|
-
from typing import List
|
|
3
|
+
from typing import List
|
|
5
4
|
|
|
5
|
+
from helm.common.local_context import LocalContext
|
|
6
6
|
from helm.common.cache import CacheConfig
|
|
7
7
|
from helm.common.cache_backend_config import CacheBackendConfig, BlackHoleCacheBackendConfig
|
|
8
8
|
from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
|
|
@@ -11,7 +11,6 @@ from helm.common.moderations_api_request import ModerationAPIRequest, Moderation
|
|
|
11
11
|
from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
|
|
12
12
|
from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
|
|
13
13
|
from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
|
|
14
|
-
from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
|
|
15
14
|
from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
|
|
16
15
|
from helm.common.tokenization_request import (
|
|
17
16
|
TokenizationRequest,
|
|
@@ -22,27 +21,13 @@ from helm.common.tokenization_request import (
|
|
|
22
21
|
from helm.common.request import Request, RequestResult
|
|
23
22
|
from helm.common.hierarchical_logger import hlog
|
|
24
23
|
from helm.proxy.accounts import Accounts, Account
|
|
25
|
-
from helm.clients.auto_client import AutoClient
|
|
26
|
-
from helm.clients.moderation_api_client import ModerationAPIClient
|
|
27
|
-
from helm.clients.image_generation.nudity_check_client import NudityCheckClient
|
|
28
|
-
from helm.clients.gcs_client import GCSClient
|
|
29
|
-
from helm.clients.clip_score_client import CLIPScoreClient
|
|
30
|
-
from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
|
|
31
|
-
from helm.proxy.example_queries import example_queries
|
|
32
|
-
from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
|
|
33
24
|
from helm.benchmark.model_deployment_registry import get_model_deployment_host_organization
|
|
34
25
|
from helm.proxy.query import Query, QueryResult
|
|
35
|
-
from helm.proxy.retry import retry_request
|
|
36
26
|
from helm.proxy.token_counters.auto_token_counter import AutoTokenCounter
|
|
37
|
-
from helm.tokenizers.auto_tokenizer import AutoTokenizer
|
|
38
27
|
from helm.proxy.services.service import (
|
|
39
28
|
Service,
|
|
40
|
-
CACHE_DIR,
|
|
41
29
|
ACCOUNTS_FILE,
|
|
42
30
|
GeneralInfo,
|
|
43
|
-
VERSION,
|
|
44
|
-
expand_environments,
|
|
45
|
-
synthesize_request,
|
|
46
31
|
)
|
|
47
32
|
|
|
48
33
|
|
|
@@ -57,43 +42,17 @@ class ServerService(Service):
|
|
|
57
42
|
root_mode: bool = False,
|
|
58
43
|
cache_backend_config: CacheBackendConfig = BlackHoleCacheBackendConfig(),
|
|
59
44
|
):
|
|
60
|
-
ensure_directory_exists(base_path)
|
|
61
|
-
client_file_storage_path = os.path.join(base_path, CACHE_DIR)
|
|
62
|
-
ensure_directory_exists(client_file_storage_path)
|
|
63
|
-
|
|
64
|
-
credentials = get_credentials(base_path)
|
|
65
45
|
accounts_path = os.path.join(base_path, ACCOUNTS_FILE)
|
|
66
46
|
|
|
67
|
-
self.
|
|
68
|
-
self.
|
|
69
|
-
self.tokenizer = AutoTokenizer(credentials, cache_backend_config)
|
|
70
|
-
self.token_counter = AutoTokenCounter(self.tokenizer)
|
|
47
|
+
self.context = LocalContext(base_path, cache_backend_config)
|
|
48
|
+
self.token_counter = AutoTokenCounter(self.context.tokenizer)
|
|
71
49
|
self.accounts = Accounts(accounts_path, root_mode=root_mode)
|
|
72
50
|
|
|
73
|
-
# Lazily instantiate the following clients
|
|
74
|
-
self.moderation_api_client: Optional[ModerationAPIClient] = None
|
|
75
|
-
self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
|
|
76
|
-
self.perspective_api_client: Optional[ToxicityClassifierClient] = None
|
|
77
|
-
self.nudity_check_client: Optional[NudityCheckClient] = None
|
|
78
|
-
self.clip_score_client: Optional[CLIPScoreClient] = None
|
|
79
|
-
self.gcs_client: Optional[GCSClient] = None
|
|
80
|
-
|
|
81
51
|
def get_general_info(self) -> GeneralInfo:
|
|
82
|
-
|
|
83
|
-
# TODO(#2158): Either fix this or delete get_general_info.
|
|
84
|
-
all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
|
|
85
|
-
return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
|
|
52
|
+
return self.context.get_general_info()
|
|
86
53
|
|
|
87
54
|
def expand_query(self, query: Query) -> QueryResult:
|
|
88
|
-
|
|
89
|
-
prompt = query.prompt
|
|
90
|
-
settings = query.settings
|
|
91
|
-
environments = parse_hocon(query.environments)
|
|
92
|
-
requests = []
|
|
93
|
-
for environment in expand_environments(environments):
|
|
94
|
-
request = synthesize_request(prompt, settings, environment)
|
|
95
|
-
requests.append(request)
|
|
96
|
-
return QueryResult(requests=requests)
|
|
55
|
+
return self.context.expand_query(query)
|
|
97
56
|
|
|
98
57
|
def _get_model_group_for_model_deployment(self, model_deployment: str) -> str:
|
|
99
58
|
if model_deployment.startswith("openai/"):
|
|
@@ -105,7 +64,11 @@ class ServerService(Service):
|
|
|
105
64
|
return "gpt4"
|
|
106
65
|
elif model_deployment.startswith("openai/gpt-3"):
|
|
107
66
|
return "gpt3"
|
|
108
|
-
elif
|
|
67
|
+
elif (
|
|
68
|
+
model_deployment.startswith("openai/o1")
|
|
69
|
+
or model_deployment.startswith("openai/o3")
|
|
70
|
+
or model_deployment.startswith("openai/o4")
|
|
71
|
+
):
|
|
109
72
|
return "o1"
|
|
110
73
|
else:
|
|
111
74
|
return "openai"
|
|
@@ -126,7 +89,7 @@ class ServerService(Service):
|
|
|
126
89
|
self.accounts.check_can_use(auth.api_key, model_group)
|
|
127
90
|
|
|
128
91
|
# Use!
|
|
129
|
-
request_result: RequestResult = self.
|
|
92
|
+
request_result: RequestResult = self.context.make_request(request)
|
|
130
93
|
|
|
131
94
|
# Only deduct if not cached
|
|
132
95
|
if not request_result.cached:
|
|
@@ -139,66 +102,39 @@ class ServerService(Service):
|
|
|
139
102
|
def tokenize(self, auth: Authentication, request: TokenizationRequest) -> TokenizationRequestResult:
|
|
140
103
|
"""Tokenize via an API."""
|
|
141
104
|
self.accounts.authenticate(auth)
|
|
142
|
-
return self.
|
|
105
|
+
return self.context.tokenize(request)
|
|
143
106
|
|
|
144
107
|
def decode(self, auth: Authentication, request: DecodeRequest) -> DecodeRequestResult:
|
|
145
108
|
"""Decodes to text."""
|
|
146
109
|
self.accounts.authenticate(auth)
|
|
147
|
-
return self.
|
|
110
|
+
return self.context.decode(request)
|
|
148
111
|
|
|
149
112
|
def upload(self, auth: Authentication, request: FileUploadRequest) -> FileUploadResult:
|
|
150
113
|
"""Uploads a file to external storage."""
|
|
151
114
|
self.accounts.authenticate(auth)
|
|
152
|
-
|
|
153
|
-
if not self.gcs_client:
|
|
154
|
-
self.gcs_client = self.client.get_gcs_client()
|
|
155
|
-
|
|
156
|
-
assert self.gcs_client
|
|
157
|
-
return self.gcs_client.upload(request)
|
|
115
|
+
return self.context.upload(request)
|
|
158
116
|
|
|
159
117
|
def check_nudity(self, auth: Authentication, request: NudityCheckRequest) -> NudityCheckResult:
|
|
160
118
|
"""Check for nudity."""
|
|
161
119
|
self.accounts.authenticate(auth)
|
|
162
|
-
|
|
163
|
-
if not self.nudity_check_client:
|
|
164
|
-
self.nudity_check_client = self.client.get_nudity_check_client()
|
|
165
|
-
|
|
166
|
-
assert self.nudity_check_client
|
|
167
|
-
return self.nudity_check_client.check_nudity(request)
|
|
120
|
+
return self.context.check_nudity(request)
|
|
168
121
|
|
|
169
122
|
def compute_clip_score(self, auth: Authentication, request: CLIPScoreRequest) -> CLIPScoreResult:
|
|
170
123
|
"""Computes CLIPScore for a given caption and image."""
|
|
171
124
|
self.accounts.authenticate(auth)
|
|
172
|
-
|
|
173
|
-
if not self.clip_score_client:
|
|
174
|
-
self.clip_score_client = self.client.get_clip_score_client()
|
|
175
|
-
|
|
176
|
-
assert self.clip_score_client
|
|
177
|
-
return self.clip_score_client.compute_score(request)
|
|
125
|
+
return self.context.compute_clip_score(request)
|
|
178
126
|
|
|
179
127
|
def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
|
|
180
|
-
@retry_request
|
|
181
|
-
def get_toxicity_scores_with_retry(request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
|
|
182
|
-
if not self.toxicity_classifier_client:
|
|
183
|
-
self.toxicity_classifier_client = self.client.get_toxicity_classifier_client()
|
|
184
|
-
return self.toxicity_classifier_client.get_toxicity_scores(request)
|
|
185
|
-
|
|
186
128
|
self.accounts.authenticate(auth)
|
|
187
|
-
return
|
|
129
|
+
return self.context.get_toxicity_scores(request)
|
|
188
130
|
|
|
189
131
|
def get_moderation_results(self, auth: Authentication, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
|
|
190
|
-
@retry_request
|
|
191
|
-
def get_moderation_results_with_retry(request: ModerationAPIRequest) -> ModerationAPIRequestResult:
|
|
192
|
-
if not self.moderation_api_client:
|
|
193
|
-
self.moderation_api_client = self.client.get_moderation_api_client()
|
|
194
|
-
return self.moderation_api_client.get_moderation_results(request)
|
|
195
|
-
|
|
196
132
|
self.accounts.authenticate(auth)
|
|
197
|
-
return
|
|
133
|
+
return self.context.get_moderation_results(request)
|
|
198
134
|
|
|
199
135
|
def make_critique_request(self, auth: Authentication, request: CritiqueRequest) -> CritiqueRequestResult:
|
|
200
136
|
self.accounts.authenticate(auth)
|
|
201
|
-
return self.
|
|
137
|
+
return self.context.make_critique_request(request)
|
|
202
138
|
|
|
203
139
|
def create_account(self, auth: Authentication) -> Account:
|
|
204
140
|
"""Creates a new account."""
|
|
@@ -233,4 +169,4 @@ class ServerService(Service):
|
|
|
233
169
|
hlog("Done.")
|
|
234
170
|
|
|
235
171
|
def get_cache_config(self, shard_name: str) -> CacheConfig:
|
|
236
|
-
return self.
|
|
172
|
+
return self.context.get_cache_config(shard_name)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
import os
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
from helm.common.cache import CacheConfig
|
|
8
|
+
from helm.common.tokenization_request import (
|
|
9
|
+
TokenizationRequest,
|
|
10
|
+
TokenizationToken,
|
|
11
|
+
)
|
|
12
|
+
from helm.tokenizers.caching_tokenizer import CachingTokenizer
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class GrokAPITokenizer(CachingTokenizer):
|
|
16
|
+
"""Tokenizer that uses the xAI Grok Tokenize Text API
|
|
17
|
+
|
|
18
|
+
Doc: https://docs.x.ai/docs/api-reference#tokenize-text"""
|
|
19
|
+
|
|
20
|
+
def __init__(self, cache_config: CacheConfig, api_key: Optional[str] = None) -> None:
|
|
21
|
+
super().__init__(cache_config)
|
|
22
|
+
self.api_key = api_key or os.environ.get("XAI_API_KEY")
|
|
23
|
+
|
|
24
|
+
def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
|
|
25
|
+
cache_key = dataclasses.asdict(request)
|
|
26
|
+
# Delete encode because the Grok API simulateously gives string and integer tokens.
|
|
27
|
+
del cache_key["encode"]
|
|
28
|
+
return cache_key
|
|
29
|
+
|
|
30
|
+
def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
|
|
31
|
+
if not self.api_key:
|
|
32
|
+
raise Exception(
|
|
33
|
+
"No Grok API key found. "
|
|
34
|
+
"Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
|
|
35
|
+
)
|
|
36
|
+
text = request["text"]
|
|
37
|
+
if not text:
|
|
38
|
+
return {"token_ids": []}
|
|
39
|
+
model = request["tokenizer"].split("/")[-1]
|
|
40
|
+
response = requests.post(
|
|
41
|
+
url="https://api.x.ai/v1/tokenize-text",
|
|
42
|
+
headers={"Authorization": f"Bearer {self.api_key}"},
|
|
43
|
+
json={"text": text, "model": model},
|
|
44
|
+
)
|
|
45
|
+
response.raise_for_status()
|
|
46
|
+
return response.json()
|
|
47
|
+
|
|
48
|
+
def _tokenization_raw_response_to_tokens(
|
|
49
|
+
self, response: Dict[str, Any], request: TokenizationRequest
|
|
50
|
+
) -> List[TokenizationToken]:
|
|
51
|
+
raw_token_field_name = "token_id" if request.encode else "string_token"
|
|
52
|
+
return [TokenizationToken(raw_token[raw_token_field_name]) for raw_token in response["token_ids"]]
|
|
53
|
+
|
|
54
|
+
def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
|
|
55
|
+
raise NotImplementedError("The xAI API does not support decoding.")
|
|
@@ -119,7 +119,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
|
|
|
119
119
|
tokens = tokenizer.encode(
|
|
120
120
|
request["text"],
|
|
121
121
|
truncation=request["truncation"],
|
|
122
|
-
max_length=request["max_length"],
|
|
122
|
+
max_length=max(request["max_length"], 0),
|
|
123
123
|
add_special_tokens=False,
|
|
124
124
|
)
|
|
125
125
|
else:
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
from helm.common.cache import BlackHoleCacheConfig
|
|
5
|
+
from helm.common.tokenization_request import (
|
|
6
|
+
TokenizationRequest,
|
|
7
|
+
TokenizationToken,
|
|
8
|
+
)
|
|
9
|
+
from helm.tokenizers.grok_tokenizer import GrokAPITokenizer
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@pytest.mark.models
|
|
13
|
+
def test_tokenize():
|
|
14
|
+
if not os.environ.get("XAI_API_KEY"):
|
|
15
|
+
pytest.skip("No xAI API key found; skipping test")
|
|
16
|
+
tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
|
|
17
|
+
request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦")
|
|
18
|
+
result = tokenizer.tokenize(request)
|
|
19
|
+
assert result.success
|
|
20
|
+
assert not result.cached
|
|
21
|
+
assert result.tokens == [TokenizationToken(token) for token in ["otter", "", "", ""]]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@pytest.mark.models
|
|
25
|
+
def test_encode():
|
|
26
|
+
if not os.environ.get("XAI_API_KEY"):
|
|
27
|
+
pytest.skip("No xAI API key found; skipping test")
|
|
28
|
+
tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
|
|
29
|
+
request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦", encode=True)
|
|
30
|
+
result = tokenizer.tokenize(request)
|
|
31
|
+
assert result.success
|
|
32
|
+
assert not result.cached
|
|
33
|
+
assert result.tokens == [TokenizationToken(token) for token in [142507, 11637, 294, 294]]
|